diff --git a/posix.mak b/posix.mak index 8b9f55ea391..5ce6959a644 100644 --- a/posix.mak +++ b/posix.mak @@ -192,7 +192,7 @@ PACKAGE_std_experimental_ndslice = package iteration selection slice PACKAGE_std_net = curl isemail PACKAGE_std_range = interfaces package primitives PACKAGE_std_regex = package $(addprefix internal/,generator ir parser \ - backtracking kickstart tests thompson) + backtracking bitnfa tests tests2 tests3 thompson shiftor) # Modules in std (including those in packages) STD_MODULES=$(call P2MODULES,$(STD_PACKAGES)) diff --git a/std/regex/internal/backtracking.d b/std/regex/internal/backtracking.d index a7c360c5e88..fb3b357ec67 100644 --- a/std/regex/internal/backtracking.d +++ b/std/regex/internal/backtracking.d @@ -30,7 +30,13 @@ template BacktrackingMatcher(bool CTregex) alias String = const(Char)[]; alias RegEx = Regex!Char; alias MatchFn = bool function (ref BacktrackingMatcher!(Char, Stream)); - RegEx re; //regex program + const(Bytecode)[] ir; + uint ngroup; + uint flags; + const(Interval[])[] charsets; + const(CharMatcher)[] matchers; + const(BitTable)[] filters; + const Kickstart!Char kickstart; static if (CTregex) MatchFn nativeFn; //native code for that program //Stream state @@ -79,12 +85,17 @@ template BacktrackingMatcher(bool CTregex) static size_t initialMemory(const ref RegEx re) { - return stackSize(re)*size_t.sizeof + re.hotspotTableSize*Trace.sizeof; + return stackSize(re.ngroup)*size_t.sizeof + re.hotspotTableSize*Trace.sizeof; } - static size_t stackSize(const ref RegEx re) + size_t initialMemory() { - return initialStack*(stateSize + re.ngroup*(Group!DataIndex).sizeof/size_t.sizeof)+1; + return stackSize(ngroup)*size_t.sizeof + merge.length*Trace.sizeof; + } + + static size_t stackSize(uint ngroup) + { + return initialStack*(stateSize + ngroup*(Group!DataIndex).sizeof/size_t.sizeof)+1; } @property bool atStart(){ return index == 0; } @@ -101,7 +112,7 @@ template BacktrackingMatcher(bool CTregex) { static if (kicked) { - if 
(!s.search(re.kickstart, front, index)) + if (!s.search(kickstart, front, index)) { index = s.lastIndex; } @@ -113,46 +124,69 @@ template BacktrackingMatcher(bool CTregex) // void newStack() { - auto chunk = mallocArray!(size_t)(stackSize(re)); + auto chunk = mallocArray!(size_t)(stackSize(ngroup)); chunk[0] = cast(size_t)(memory.ptr); memory = chunk[1..$]; } - void initExternalMemory(void[] memBlock) + void initExternalMemory(void[] memBlock, size_t hotspotTableSize) { - merge = arrayInChunk!(Trace)(re.hotspotTableSize, memBlock); + merge = arrayInChunk!(Trace)(hotspotTableSize, memBlock); merge[] = Trace.init; memory = cast(size_t[])memBlock; memory[0] = 0; //hidden pointer memory = memory[1..$]; } - void initialize(ref RegEx program, Stream stream, void[] memBlock) + void dupTo(void[] memory) { - re = program; - s = stream; - exhausted = false; - initExternalMemory(memBlock); - backrefed = null; + initExternalMemory(memory, merge.length); } - auto dupTo(void[] memory) + this(Matcher)(ref Matcher matcher, Stream stream, void[] memBlock, dchar ch, DataIndex idx) { - typeof(this) tmp = this; - tmp.initExternalMemory(memory); - return tmp; + ir = matcher.ir; + charsets = matcher.charsets; + filters = matcher.filters; + matchers = matcher.matchers; + ngroup = matcher.ngroup; + flags = matcher.flags; + s = stream; + exhausted = false; + initExternalMemory(memBlock, matcher.merge.length); + backrefed = null; + front = ch; + index = idx; } - this(ref RegEx program, Stream stream, void[] memBlock, dchar ch, DataIndex idx) + this(Matcher)(ref Matcher matcher, Stream stream, void[] memBlock) { - initialize(program, stream, memBlock); - front = ch; - index = idx; + ir = matcher.ir; + charsets = matcher.charsets; + filters = matcher.filters; + matchers = matcher.matchers; + ngroup = matcher.ngroup; + flags = matcher.flags; + s = stream; + exhausted = false; + initExternalMemory(memBlock, matcher.merge.length); + backrefed = null; + next(); } - this(ref RegEx program, Stream 
stream, void[] memBlock) + this()(ref const RegEx program, Stream stream, void[] memBlock, uint regexFlags) { - initialize(program, stream, memBlock); + kickstart = program.kickstart; + ir = program.ir; + charsets = program.charsets; + filters = program.filters; + matchers = program.matchers; + ngroup = program.ngroup; + flags = regexFlags; + s = stream; + exhausted = false; + initExternalMemory(memBlock, program.hotspotTableSize); + backrefed = null; next(); } @@ -160,7 +194,7 @@ template BacktrackingMatcher(bool CTregex) { alias BackMatcherTempl = .BacktrackingMatcher!(CTregex); alias BackMatcher = BackMatcherTempl!(Char, Stream); - auto fwdMatcher = BackMatcher(matcher.re, s, memBlock, front, index); + auto fwdMatcher = BackMatcher(matcher, s, memBlock, front, index); return fwdMatcher; } @@ -169,7 +203,7 @@ template BacktrackingMatcher(bool CTregex) alias BackMatcherTempl = .BacktrackingMatcher!(CTregex); alias BackMatcher = BackMatcherTempl!(Char, typeof(s.loopBack(index))); auto fwdMatcher = - BackMatcher(matcher.re, s.loopBack(index), memBlock); + BackMatcher(matcher, s.loopBack(index), memBlock); return fwdMatcher; } @@ -182,7 +216,7 @@ template BacktrackingMatcher(bool CTregex) {//stream is updated here matches[0].begin = start; matches[0].end = index; - if (!(re.flags & RegexOption.global) || atEnd) + if (!(flags & RegexOption.global) || atEnd) exhausted = true; if (start == index)//empty match advances input next(); @@ -202,7 +236,7 @@ template BacktrackingMatcher(bool CTregex) if (exhausted) //all matches collected return false; this.matches = matches; - if (re.flags & RegexInfo.oneShot) + if (flags & RegexInfo.oneShot) { exhausted = true; const DataIndex start = index; @@ -216,7 +250,7 @@ template BacktrackingMatcher(bool CTregex) } static if (kicked) { - if (!re.kickstart.empty) + if (kickstart) { for (;;) { @@ -225,6 +259,7 @@ template BacktrackingMatcher(bool CTregex) return val; else { + import std.stdio; if (atEnd) break; search(); @@ -284,19 
+319,19 @@ template BacktrackingMatcher(bool CTregex) { debug(std_regex_matcher) writefln("PC: %s\tCNT: %s\t%s \tfront: %s src: %s", - pc, counter, disassemble(re.ir, pc, re.dict), + pc, counter, disassemble(ir, pc), front, s._index); - switch (re.ir[pc].code) + switch (ir[pc].code) { case IR.OrChar://assumes IRL!(OrChar) == 1 if (atEnd) goto L_backtrack; - uint len = re.ir[pc].sequence; + uint len = ir[pc].sequence; uint end = pc + len; - if (re.ir[pc].data != front && re.ir[pc+1].data != front) + if (ir[pc].data != front && ir[pc+1].data != front) { for (pc = pc+2; pc < end; pc++) - if (re.ir[pc].data == front) + if (ir[pc].data == front) break; if (pc == end) goto L_backtrack; @@ -305,7 +340,7 @@ template BacktrackingMatcher(bool CTregex) next(); break; case IR.Char: - if (atEnd || front != re.ir[pc].data) + if (atEnd || front != ir[pc].data) goto L_backtrack; pc += IRL!(IR.Char); next(); @@ -317,13 +352,13 @@ template BacktrackingMatcher(bool CTregex) next(); break; case IR.CodepointSet: - if (atEnd || !re.charsets[re.ir[pc].data].scanFor(front)) + if (atEnd || !charsets[ir[pc].data].scanFor(front)) goto L_backtrack; next(); pc += IRL!(IR.CodepointSet); break; case IR.Trie: - if (atEnd || !re.matchers[re.ir[pc].data][front]) + if (atEnd || !matchers[ir[pc].data][front]) goto L_backtrack; next(); pc += IRL!(IR.Trie); @@ -411,10 +446,10 @@ template BacktrackingMatcher(bool CTregex) goto L_backtrack; break; case IR.InfiniteStart, IR.InfiniteQStart: - pc += re.ir[pc].data + IRL!(IR.InfiniteStart); + pc += ir[pc].data + IRL!(IR.InfiniteStart); //now pc is at end IR.Infinite(Q)End - uint len = re.ir[pc].data; - if (re.ir[pc].code == IR.InfiniteEnd) + uint len = ir[pc].data; + if (ir[pc].code == IR.InfiniteEnd) { pushState(pc+IRL!(IR.InfiniteEnd), counter); pc -= len; @@ -426,29 +461,29 @@ template BacktrackingMatcher(bool CTregex) } break; case IR.InfiniteBloomStart: - pc += re.ir[pc].data + IRL!(IR.InfiniteBloomStart); + pc += ir[pc].data + 
IRL!(IR.InfiniteBloomStart); //now pc is at end IR.InfiniteBloomEnd - immutable len = re.ir[pc].data; - immutable filterIdx = re.ir[pc+2].raw; - if (re.filters[filterIdx][front]) + immutable len = ir[pc].data; + immutable filterIdx = ir[pc+2].raw; + if (filters[filterIdx][front]) pushState(pc+IRL!(IR.InfiniteBloomEnd), counter); pc -= len; break; case IR.RepeatStart, IR.RepeatQStart: - pc += re.ir[pc].data + IRL!(IR.RepeatStart); + pc += ir[pc].data + IRL!(IR.RepeatStart); break; case IR.RepeatEnd: case IR.RepeatQEnd: - if (merge[re.ir[pc + 1].raw+counter].mark(index)) + if (merge[ir[pc + 1].raw+counter].mark(index)) { // merged! goto L_backtrack; } //len, step, min, max - immutable len = re.ir[pc].data; - immutable step = re.ir[pc+2].raw; - immutable min = re.ir[pc+3].raw; - immutable max = re.ir[pc+4].raw; + immutable len = ir[pc].data; + immutable step = ir[pc+2].raw; + immutable min = ir[pc+3].raw; + immutable max = ir[pc+4].raw; if (counter < min) { counter += step; @@ -456,7 +491,7 @@ template BacktrackingMatcher(bool CTregex) } else if (counter < max) { - if (re.ir[pc].code == IR.RepeatEnd) + if (ir[pc].code == IR.RepeatEnd) { pushState(pc + IRL!(IR.RepeatEnd), counter%step); counter += step; @@ -478,13 +513,13 @@ template BacktrackingMatcher(bool CTregex) case IR.InfiniteEnd: case IR.InfiniteQEnd: debug(std_regex_matcher) writeln("Infinited nesting:", infiniteNesting); - if (merge[re.ir[pc + 1].raw+counter].mark(index)) + if (merge[ir[pc + 1].raw+counter].mark(index)) { // merged! 
goto L_backtrack; } - immutable len = re.ir[pc].data; - if (re.ir[pc].code == IR.InfiniteEnd) + immutable len = ir[pc].data; + if (ir[pc].code == IR.InfiniteEnd) { pushState(pc + IRL!(IR.InfiniteEnd), counter); pc -= len; @@ -497,14 +532,14 @@ template BacktrackingMatcher(bool CTregex) break; case IR.InfiniteBloomEnd: debug(std_regex_matcher) writeln("Infinited nesting:", infiniteNesting); - if (merge[re.ir[pc + 1].raw+counter].mark(index)) + if (merge[ir[pc + 1].raw+counter].mark(index)) { // merged! goto L_backtrack; } - immutable len = re.ir[pc].data; - immutable filterIdx = re.ir[pc+2].raw; - if (re.filters[filterIdx][front]) + immutable len = ir[pc].data; + immutable filterIdx = ir[pc+2].raw; + if (filters[filterIdx][front]) { infiniteNesting--; pushState(pc + IRL!(IR.InfiniteBloomEnd), counter); @@ -513,7 +548,7 @@ template BacktrackingMatcher(bool CTregex) pc -= len; break; case IR.OrEnd: - if (merge[re.ir[pc + 1].raw+counter].mark(index)) + if (merge[ir[pc + 1].raw+counter].mark(index)) { // merged! 
goto L_backtrack; @@ -524,34 +559,34 @@ template BacktrackingMatcher(bool CTregex) pc += IRL!(IR.OrStart); goto case; case IR.Option: - immutable len = re.ir[pc].data; - if (re.ir[pc+len].code == IR.GotoEndOr)//not a last one + immutable len = ir[pc].data; + if (ir[pc+len].code == IR.GotoEndOr)//not a last one { pushState(pc + len + IRL!(IR.Option), counter); //remember 2nd branch } pc += IRL!(IR.Option); break; case IR.GotoEndOr: - pc = pc + re.ir[pc].data + IRL!(IR.GotoEndOr); + pc = pc + ir[pc].data + IRL!(IR.GotoEndOr); break; case IR.GroupStart: - immutable n = re.ir[pc].data; + immutable n = ir[pc].data; matches[n].begin = index; debug(std_regex_matcher) writefln("IR group #%u starts at %u", n, index); pc += IRL!(IR.GroupStart); break; case IR.GroupEnd: - immutable n = re.ir[pc].data; + immutable n = ir[pc].data; matches[n].end = index; debug(std_regex_matcher) writefln("IR group #%u ends at %u", n, index); pc += IRL!(IR.GroupEnd); break; case IR.LookaheadStart: case IR.NeglookaheadStart: - immutable len = re.ir[pc].data; + immutable len = ir[pc].data; auto save = index; - immutable ms = re.ir[pc+1].raw, me = re.ir[pc+2].raw; - auto mem = malloc(initialMemory(re))[0..initialMemory(re)]; + immutable ms = ir[pc+1].raw, me = ir[pc+2].raw; + auto mem = malloc(initialMemory())[0..initialMemory()]; scope(exit) free(mem.ptr); static if (Stream.isLoopback) { @@ -563,10 +598,10 @@ template BacktrackingMatcher(bool CTregex) } matcher.matches = matches[ms .. me]; matcher.backrefed = backrefed.empty ? matches : backrefed; - matcher.re.ir = re.ir[ + matcher.ir = ir[ pc+IRL!(IR.LookaheadStart) .. 
pc+IRL!(IR.LookaheadStart)+len+IRL!(IR.LookaheadEnd) ]; - immutable match = (matcher.matchImpl() != 0) ^ (re.ir[pc].code == IR.NeglookaheadStart); + immutable match = (matcher.matchImpl() != 0) ^ (ir[pc].code == IR.NeglookaheadStart); s.reset(save); next(); if (!match) @@ -578,26 +613,26 @@ template BacktrackingMatcher(bool CTregex) break; case IR.LookbehindStart: case IR.NeglookbehindStart: - immutable len = re.ir[pc].data; - immutable ms = re.ir[pc+1].raw, me = re.ir[pc+2].raw; - auto mem = malloc(initialMemory(re))[0..initialMemory(re)]; + immutable len = ir[pc].data; + immutable ms = ir[pc+1].raw, me = ir[pc+2].raw; + auto mem = malloc(initialMemory())[0..initialMemory()]; scope(exit) free(mem.ptr); static if (Stream.isLoopback) { alias Matcher = BacktrackingMatcher!(Char, Stream); - auto matcher = Matcher(re, s, mem, front, index); + auto matcher = Matcher(this, s, mem, front, index); } else { alias Matcher = BacktrackingMatcher!(Char, typeof(s.loopBack(index))); - auto matcher = Matcher(re, s.loopBack(index), mem); + auto matcher = Matcher(this, s.loopBack(index), mem); } matcher.matches = matches[ms .. me]; - matcher.re.ir = re.ir[ + matcher.ir = ir[ pc + IRL!(IR.LookbehindStart) .. pc + IRL!(IR.LookbehindStart) + len + IRL!(IR.LookbehindEnd) ]; matcher.backrefed = backrefed.empty ? matches : backrefed; - immutable match = (matcher.matchImpl() != 0) ^ (re.ir[pc].code == IR.NeglookbehindStart); + immutable match = (matcher.matchImpl() != 0) ^ (ir[pc].code == IR.NeglookbehindStart); if (!match) goto L_backtrack; else @@ -606,8 +641,8 @@ template BacktrackingMatcher(bool CTregex) } break; case IR.Backref: - immutable n = re.ir[pc].data; - auto referenced = re.ir[pc].localRef + immutable n = ir[pc].data; + auto referenced = ir[pc].localRef ? s[matches[n].begin .. matches[n].end] : s[backrefed[n].begin .. 
backrefed[n].end]; while (!atEnd && !referenced.empty && front == referenced.front) @@ -628,9 +663,9 @@ template BacktrackingMatcher(bool CTregex) case IR.LookbehindEnd: case IR.NeglookbehindEnd: case IR.End: - return re.ir[pc].data; + return ir[pc].data; default: - debug printBytecode(re.ir[0..$]); + debug printBytecode(ir[0..$]); assert(0); L_backtrack: if (!popState()) @@ -659,7 +694,7 @@ template BacktrackingMatcher(bool CTregex) { import core.stdc.stdlib : free; free(memory.ptr);//last segment is freed in RegexMatch - immutable size = initialStack*(stateSize + 2*re.ngroup); + immutable size = initialStack*(stateSize + 2*ngroup); memory = prev[0..size]; lastState = size; return true; @@ -794,7 +829,7 @@ struct CtContext //to mark the portion of matches to save int match, total_matches; int reserved; - CodepointSet[] charsets; + const Interval[][] charsets; //state of codegenerator @@ -804,7 +839,7 @@ struct CtContext int addr; } - this(Char)(Regex!Char re) + this(Char)(const Regex!Char re) { match = 1; reserved = 1; //first match is skipped @@ -866,7 +901,7 @@ struct CtContext } // - CtState ctGenBlock(Bytecode[] ir, int addr) + CtState ctGenBlock(const(Bytecode)[] ir, int addr) { CtState result; result.addr = addr; @@ -880,7 +915,7 @@ struct CtContext } // - CtState ctGenGroup(ref Bytecode[] ir, int addr) + CtState ctGenGroup(ref const(Bytecode)[] ir, int addr) { import std.algorithm.comparison : max; auto bailOut = "goto L_backtrack;"; @@ -943,7 +978,7 @@ struct CtContext //(neg)lookaround piece ends } auto save = index; - auto mem = malloc(initialMemory(re))[0..initialMemory(re)]; + auto mem = malloc(initialMemory())[0..initialMemory()]; scope(exit) free(mem.ptr); static if (typeof(matcher.s).isLoopback) auto lookaround = $$; @@ -982,7 +1017,7 @@ struct CtContext } //generate source for bytecode contained in OrStart ... 
OrEnd - CtState ctGenAlternation(Bytecode[] ir, int addr) + CtState ctGenAlternation(const(Bytecode)[] ir, int addr) { CtState[] pieces; CtState r; @@ -1022,11 +1057,11 @@ struct CtContext // generate fixup code for instruction in ir, // fixup means it has an alternative way for control flow - string ctGenFixupCode(Bytecode[] ir, int addr, int fixup) + string ctGenFixupCode(const(Bytecode)[] ir, int addr, int fixup) { return ctGenFixupCode(ir, addr, fixup); // call ref Bytecode[] version } - string ctGenFixupCode(ref Bytecode[] ir, int addr, int fixup) + string ctGenFixupCode(ref const(Bytecode)[] ir, int addr, int fixup) { string r; string testCode; @@ -1180,7 +1215,7 @@ struct CtContext } - string ctQuickTest(Bytecode[] ir, int id) + string ctQuickTest(const(Bytecode)[] ir, int id) { uint pc = 0; while (pc < ir.length && ir[pc].isAtom) @@ -1207,7 +1242,7 @@ struct CtContext } //process & generate source for simple bytecodes at front of ir using address addr - CtState ctGenAtom(ref Bytecode[] ir, int addr) + CtState ctGenAtom(ref const(Bytecode)[] ir, int addr) { CtState result; result.code = ctAtomCode(ir, addr); @@ -1217,7 +1252,7 @@ struct CtContext } //D code for atom at ir using address addr, addr < 0 means quickTest - string ctAtomCode(Bytecode[] ir, int addr) + string ctAtomCode(const(Bytecode)[] ir, int addr) { string code; string bailOut, nextInstr; @@ -1262,7 +1297,7 @@ struct CtContext break; case IR.Any: code ~= ctSub( ` - if (atEnd || (!(re.flags & RegexOption.singleline) + if (atEnd || (!(flags & RegexOption.singleline) && (front == '\r' || front == '\n'))) $$ $$ @@ -1272,7 +1307,7 @@ struct CtContext if (charsets.length) { string name = `func_`~to!string(addr+1); - string funcCode = charsets[ir[0].data].toSourceCode(name); + string funcCode = CodepointSet(charsets[ir[0].data]).toSourceCode(name); code ~= ctSub( ` static $$ if (atEnd || !$$(front)) @@ -1282,16 +1317,16 @@ struct CtContext } else code ~= ctSub( ` - if (atEnd || 
!re.charsets[$$].scanFor(front)) + if (atEnd || !charsets[$$].scanFor(front)) $$ $$ $$`, ir[0].data, bailOut, addr >= 0 ? "next();" :"", nextInstr); break; case IR.Trie: - if (charsets.length && charsets[ir[0].data].byInterval.length <= 8) + if (charsets.length && charsets[ir[0].data].length <= 8) goto case IR.CodepointSet; code ~= ctSub( ` - if (atEnd || !re.matchers[$$][front]) + if (atEnd || !matchers[$$][front]) $$ $$ $$`, ir[0].data, bailOut, addr >= 0 ? "next();" :"", nextInstr); @@ -1429,7 +1464,7 @@ struct CtContext } //generate D code for the whole regex - public string ctGenRegEx(Bytecode[] ir) + public string ctGenRegEx(const(Bytecode)[] ir) { auto bdy = ctGenBlock(ir, 0); auto r = ` @@ -1475,7 +1510,7 @@ struct CtContext } -string ctGenRegExCode(Char)(Regex!Char re) +string ctGenRegExCode(Char)(const Regex!Char re) { auto context = CtContext(re); return context.ctGenRegEx(re.ir); diff --git a/std/regex/internal/bitnfa.d b/std/regex/internal/bitnfa.d new file mode 100644 index 00000000000..2dd0b8f0d41 --- /dev/null +++ b/std/regex/internal/bitnfa.d @@ -0,0 +1,741 @@ +//Written in the D programming language +/* + Implementation of a concept "NFA in a word" which is + bit-parallel implementation of regex where each bit represents + a state in an NFA. Execution is Thompson-style achieved via bit tricks. + + There is a great number of limitations including not tracking any state (captures) + and not supporting even basic assertions such as ^, $ or \b. 
+*/ +module std.regex.internal.bitnfa; + +package(std.regex): + +import std.regex.internal.ir; + +debug(std_regex_bitnfa) import std.stdio; +import std.algorithm; + + +struct HashTab +{ +pure: + @disable this(this); + + uint opIndex()(uint key) const + { + auto p = locate(key, table); + assert(p.occupied); + return p.value; + } + + bool opBinaryRight(string op:"in")(uint key) const + { + auto p = locate(key, table); + return p.occupied; + } + + void opIndexAssign(uint value, uint key) + { + if (table.length == 0) grow(); + auto p = locate(key, table); + if (!p.occupied) + { + items++; + if (4 * items >= table.length * 3) + { + grow(); + p = locate(key, table); + } + p.key_ = key; + p.setOccupied(); + } + p.value = value; + } + + auto keys() const + { + import std.array : appender; + auto app = appender!(uint[])(); + foreach (i, v; table) + { + if (v.occupied) + app.put(v.key); + } + return app.data; + } + + auto values() const + { + import std.array : appender; + auto app = appender!(uint[])(); + foreach (i, v; table) + { + if (v.occupied) + app.put(v.value); + } + return app.data; + } + +private: + static uint hashOf()(uint val) + { + return (val >> 20) ^ (val>>8) ^ val; + } + + struct Node + { + pure: + uint key_; + uint value; + @property uint key()() const { return key_ & 0x7fff_ffff; } + @property bool occupied()() const { return (key_ & 0x8000_0000) != 0; } + void setOccupied(){ key_ |= 0x8000_0000; } + } + Node[] table; + size_t items; + + static N* locate(N)(uint key, N[] table) + { + size_t slot = hashOf(key) & (table.length-1); + while (table[slot].occupied) + { + if (table[slot].key == key) + break; + slot += 1; + if (slot == table.length) + slot = 0; + } + return table.ptr + slot; + } + + void grow() + { + Node[] newTable = new Node[table.length ? 
table.length*2 : 4]; + foreach (i, v; table) + { + if (v.occupied) + { + auto p = locate(v.key, newTable); + *p = v; + } + } + table = newTable; + } +} + +unittest +{ + HashTab tab; + tab[3] = 1; + tab[7] = 2; + tab[11] = 3; + assert(tab[3] == 1); + assert(tab[7] == 2); + assert(tab[11] == 3); +} + + +// Specialized 2-level trie of uint masks for BitNfa. +// Uses the concept of CoW: a page gets modified in place +// if the block's ref-count is 1, else a newblock is allocated +// and ref count is decreased +struct UIntTrie2 +{ +pure: + ushort[] index; // pages --> blocks + ushort[] refCounts; // ref counts for each block + uint[] hashes; // hashes of blocks + uint[] blocks; // linear array with blocks + uint[] scratch; // temporary block + enum blockBits = 8; // size of block in bits + enum blockSize = 1<>blockBits]; + return blocks.ptr[blk*blockSize + (ch & (blockSize-1))]; + } + + void setPageRange(string op)(uint val, uint low, uint high) + { + immutable blk = index[low>>blockBits]; + if (refCounts[blk] == 1) // modify in-place + { + immutable lowIdx = blk*blockSize + (low & (blockSize-1)); + immutable highIdx = high - low + lowIdx; + mixin("blocks[lowIdx..highIdx] "~op~"= val;"); + } + else + { + // create a new page + refCounts[blk]--; + immutable lowIdx = low & (blockSize-1); + immutable highIdx = high - low + lowIdx; + scratch[] = blocks[blk*blockSize..(blk+1)*blockSize]; + mixin("scratch[lowIdx..highIdx] "~op~"= val;"); + uint h = hash(scratch); + bool found = false; + foreach (i,x; hashes) + { + if (x != h) continue; + if (scratch[] == blocks[i*blockSize .. 
(i+1)*blockSize]) + { + // re-route to existing page + index[low>>blockBits] = cast(ushort)i; + refCounts[i]++; // inc refs + found = true; + break; + } + } + if (!found) + { + index[low>>blockBits] = cast(ushort)hashes.length; + blocks ~= scratch[]; + refCounts ~= 1; + hashes ~= h; + } + } + } + + void opIndexOpAssign(string op)(uint val, dchar ch) + { + setPageRange!op(val, ch, ch+1); + } + + void opSliceOpAssign(string op)(uint val, uint start, uint end) + { + uint startBlk = start >> blockBits; + uint endBlk = end >> blockBits; + uint first = min(startBlk*blockSize+blockSize, end); + setPageRange!op(val, start, first); + foreach (blk; startBlk..endBlk) + setPageRange!op(val, blk*blockSize, (blk+1)*blockSize); + if (first != end) + { + setPageRange!op(val, endBlk*blockSize, end); + } + } +} + +unittest +{ + UIntTrie2 trie = UIntTrie2(); + trie['d'] &= 3; + assert(trie['d'] == 3); + trie['\u0280'] &= 1; + assert(trie['\u0280'] == 1); + import std.uni; + UIntTrie2 trie2 = UIntTrie2(); + auto letters = unicode("L"); + foreach (r; letters.byInterval) + trie2[r.a..r.b] &= 1; + foreach (ch; letters.byCodepoint) + assert(trie2[ch] == 1); + auto space = unicode("WhiteSpace"); + auto trie3 = UIntTrie2(); + foreach (r; space.byInterval) + trie3[r.a..r.b] &= 2; + foreach (ch; space.byCodepoint) + assert(trie3[ch] == 2); +} + +// Since there is no way to mark a starting position +// we need 2 instances of BitNfa: one to find the end, and the other +// to run backwards to find the start. 
+struct BitNfa +{ +pure: + uint[128] asciiTab; // state mask for ascii characters + UIntTrie2 uniTab; // state mask for unicode characters + HashTab controlFlow; // maps each bit pattern to resulting jumps pattern + uint controlFlowMask; // masks all control flow bits + uint finalMask; // marks final states terminating the NFA + uint length; // if this engine is empty + + @property bool empty() const { return length == 0; } + + void combineControlFlow() + { + uint[] keys = controlFlow.keys; + uint[] values = controlFlow.values; + auto selection = new bool[keys.length]; + bool nextChoice() + { + uint i; + for (i=0;i %d %s", j, ir[j].mnemonic); + paths.push(j+IRL!Option); + //writefln(">> %d", j+IRL!Option); + j = j + ir[j].data + IRL!Option; + } + break; + case GotoEndOr: + paths.push(j+IRL!GotoEndOr+ir[j].data); + break; + case OrEnd, Wordboundary, Notwordboundary, Bof, Bol, Eol, Eof, Nop, GroupStart, GroupEnd: + paths.push(j+ir[j].length); + break; + case LookaheadStart, NeglookaheadStart, LookbehindStart, + NeglookbehindStart: + paths.push(j + IRL!LookaheadStart + ir[j].data + IRL!LookaheadEnd); + break; + case InfiniteStart, InfiniteQStart: + paths.push(j+IRL!InfiniteStart); + paths.push(j+IRL!InfiniteStart+ir[j].data+IRL!InfiniteEnd); + break; + case InfiniteBloomStart: + paths.push(j+IRL!InfiniteStart); + paths.push(j+IRL!InfiniteBloomStart+ir[j].data+IRL!InfiniteBloomEnd); + break; + case InfiniteEnd, InfiniteQEnd: + paths.push(j-ir[j].data); + paths.push(j+IRL!InfiniteEnd); + break; + case InfiniteBloomEnd: + paths.push(j-ir[j].data); + paths.push(j+IRL!InfiniteBloomEnd); + break; + default: + result ~= j; + } + } + return result; + } + + this(Char)(auto ref Regex!Char re) + { + asciiTab[] = uint.max; // all ones + uniTab = UIntTrie2(); + controlFlow[0] = 0; + // pc -> bit number + uint[] bitMapping = new uint[re.ir.length]; + uint bitCount = 0, nesting=0, lastNonnested=0; + with(re) +outer: for (uint i=0; i user group number uint ngroup; // number of 
internal groups uint maxCounterDepth; // max depth of nested {n,m} repetitions uint hotspotTableSize; // number of entries in merge table uint threadCount; // upper bound on number of Thompson VM threads uint flags; // global regex flags - public const(CharMatcher)[] matchers; // tables that represent character sets - public const(BitTable)[] filters; // bloom filters for conditional loops + Interval[][] charsets; // intervals of characters + const(CharMatcher)[] matchers; // tables that represent character sets + const(BitTable)[] filters; // bloom filters for conditional loops uint[] backrefed; // bit array of backreferenced submatches Kickstart!Char kickstart; @@ -558,10 +674,10 @@ package(std.regex): {//@@@BUG@@@ write is system for (uint i = 0; i < ir.length; i += ir[i].length) { - writefln("%d\t%s ", i, disassemble(ir, i, dict)); + debug(std_regex_parser) writefln("%d\t%s ", i, disassemble(ir, i, dict)); } - writeln("Total merge table size: ", hotspotTableSize); - writeln("Max counter nesting depth: ", maxCounterDepth); + debug(std_regex_parser) writeln("Total merge table size: ", hotspotTableSize); + debug(std_regex_parser) writeln("Max counter nesting depth: ", maxCounterDepth); } } @@ -577,11 +693,10 @@ package(std.regex): public: Regex!Char _regex; alias _regex this; - this(Regex!Char re, MatchFn fn) + this(immutable Regex!Char re, MatchFn fn) immutable { _regex = re; nativeFn = fn; - } } @@ -622,10 +737,10 @@ struct Input(Char) @property bool atEnd(){ return _index == _origin.length; } - bool search(Kickstart)(ref Kickstart kick, ref dchar res, ref size_t pos) + + bool search(const Kickstart!Char kick, ref dchar res, ref size_t pos) { - size_t idx = kick.search(_origin, _index); - _index = idx; + kick.search(this); return nextChar(res, pos); } @@ -705,8 +820,8 @@ template BackLooper(E) } // -@trusted uint lookupNamedGroup(String)(NamedGroup[] dict, String name) -{//equal is @system? 
+@safe uint lookupNamedGroup(String)(const(NamedGroup)[] dict, String name) +{ import std.range : assumeSorted; import std.conv : text; import std.algorithm.iteration : map; @@ -742,6 +857,7 @@ public class RegexException : Exception // simple 128-entry bit-table used with a hash function struct BitTable { +pure: uint[4] filter; this(CodepointSet set){ @@ -770,7 +886,7 @@ struct BitTable { struct CharMatcher { BitTable ascii; // fast path for ASCII Trie trie; // slow path for Unicode - +pure: this(CodepointSet set) { auto asciiSet = set & unicode.ASCII; diff --git a/std/regex/internal/parser.d b/std/regex/internal/parser.d index 49f6b45573f..3f0fb806730 100644 --- a/std/regex/internal/parser.d +++ b/std/regex/internal/parser.d @@ -4,13 +4,14 @@ */ module std.regex.internal.parser; -import std.regex.internal.ir; +import std.regex.internal.ir, std.regex.internal.shiftor, + std.regex.internal.bitnfa; import std.range.primitives, std.uni, std.meta, - std.traits, std.typecons, std.exception; + std.traits, std.typecons, std.exception, std.range; static import std.ascii; // package relevant info from parser into a regex object -auto makeRegex(S, CG)(Parser!(S, CG) p) +auto makeRegex(S, CG)(Parser!(S, CG) p) pure { Regex!(BasicElementOf!S) re; auto g = p.g; @@ -21,7 +22,10 @@ auto makeRegex(S, CG)(Parser!(S, CG) p) ngroup = g.ngroup; maxCounterDepth = g.counterDepth; flags = p.re_flags; - charsets = g.charsets; + charsets = g.charsets + .map!(x => + x.byInterval.map!(x=>Interval(x.a,x.b)).array + ).array; matchers = g.matchers; backrefed = g.backrefed; re.postprocess(); @@ -76,87 +80,6 @@ unittest assert(nc.equal(cp[1 .. 
$ - 1])); } - -@trusted void reverseBytecode()(Bytecode[] code) -{ - Bytecode[] rev = new Bytecode[code.length]; - uint revPc = cast(uint)rev.length; - Stack!(Tuple!(uint, uint, uint)) stack; - uint start = 0; - uint end = cast(uint)code.length; - for (;;) - { - for (uint pc = start; pc < end; ) - { - immutable len = code[pc].length; - if (code[pc].code == IR.GotoEndOr) - break; //pick next alternation branch - if (code[pc].isAtom) - { - rev[revPc - len .. revPc] = code[pc .. pc + len]; - revPc -= len; - pc += len; - } - else if (code[pc].isStart || code[pc].isEnd) - { - //skip over other embedded lookbehinds they are reversed - if (code[pc].code == IR.LookbehindStart - || code[pc].code == IR.NeglookbehindStart) - { - immutable blockLen = len + code[pc].data - + code[pc].pairedLength; - rev[revPc - blockLen .. revPc] = code[pc .. pc + blockLen]; - pc += blockLen; - revPc -= blockLen; - continue; - } - immutable second = code[pc].indexOfPair(pc); - immutable secLen = code[second].length; - rev[revPc - secLen .. revPc] = code[second .. 
second + secLen]; - revPc -= secLen; - if (code[pc].code == IR.OrStart) - { - //we pass len bytes forward, but secLen in reverse - immutable revStart = revPc - (second + len - secLen - pc); - uint r = revStart; - uint i = pc + IRL!(IR.OrStart); - while (code[i].code == IR.Option) - { - if (code[i - 1].code != IR.OrStart) - { - assert(code[i - 1].code == IR.GotoEndOr); - rev[r - 1] = code[i - 1]; - } - rev[r] = code[i]; - auto newStart = i + IRL!(IR.Option); - auto newEnd = newStart + code[i].data; - auto newRpc = r + code[i].data + IRL!(IR.Option); - if (code[newEnd].code != IR.OrEnd) - { - newRpc--; - } - stack.push(tuple(newStart, newEnd, newRpc)); - r += code[i].data + IRL!(IR.Option); - i += code[i].data + IRL!(IR.Option); - } - pc = i; - revPc = revStart; - assert(code[pc].code == IR.OrEnd); - } - else - pc += len; - } - } - if (stack.empty) - break; - start = stack.top[0]; - end = stack.top[1]; - revPc = stack.top[2]; - stack.pop(); - } - code[] = rev[]; -} - //test if a given string starts with hex number of maxDigit that's a valid codepoint //returns it's value and skips these maxDigit chars on success, throws on failure dchar parseUniHex(Char)(ref Char[] str, size_t maxDigit) @@ -181,7 +104,7 @@ dchar parseUniHex(Char)(ref Char[] str, size_t maxDigit) return val; } -@system unittest //BUG canFind is system +@safe unittest { import std.algorithm.searching : canFind; string[] non_hex = [ "000j", "000z", "FffG", "0Z"]; @@ -211,7 +134,7 @@ auto caseEnclose(CodepointSet set) /+ fetch codepoint set corresponding to a name (InBlock or binary property) +/ -@trusted CodepointSet getUnicodeSet(in char[] name, bool negated, bool casefold) +@trusted CodepointSet getUnicodeSet(in char[] name, bool negated, bool casefold) pure { CodepointSet s = unicode(name); //FIXME: caseEnclose for new uni as Set | CaseEnclose(SET && LC) @@ -222,35 +145,9 @@ auto caseEnclose(CodepointSet set) return s; } -//basic stack, just in case it gets used anywhere else then Parser -@trusted 
struct Stack(T) -{ - T[] data; - @property bool empty(){ return data.empty; } - - @property size_t length(){ return data.length; } - - void push(T val){ data ~= val; } - - T pop() - { - assert(!empty); - auto val = data[$ - 1]; - data = data[0 .. $ - 1]; - if (!__ctfe) - cast(void)data.assumeSafeAppend(); - return val; - } - - @property ref T top() - { - assert(!empty); - return data[$ - 1]; - } -} - struct CodeGen { +pure: Bytecode[] ir; // resulting bytecode Stack!(uint) fixupStack; // stack of opened start instructions NamedGroup[] dict; // maps name -> user group number @@ -335,7 +232,7 @@ struct CodeGen } if (ivals.length*2 > maxCharsetUsed) { - auto t = getMatcher(set); + auto t = CharMatcher(set); put(Bytecode(IR.Trie, cast(uint)matchers.length)); matchers ~= t; debug(std_regex_allocation) writeln("Trie generated"); @@ -616,6 +513,7 @@ enum infinite = ~0u; struct Parser(R, Generator) if (isForwardRange!R && is(ElementType!R : dchar)) { +pure: dchar _current; bool empty; R pat, origin; //keep full pattern for pretty printing error messages @@ -728,8 +626,6 @@ struct Parser(R, Generator) while (!empty) { - debug(std_regex_parser) - __ctfe || writeln("*LR*\nSource: ", pat, "\nStack: ",fixupStack.data); switch (current) { case '(': @@ -1484,11 +1380,13 @@ struct Parser(R, Generator) if (current >= privateUseStart && current <= privateUseEnd) { g.endPattern(current - privateUseStart + 1); - break; } - auto op = Bytecode(IR.Char, current); + else + { + auto op = Bytecode(IR.Char, current); + g.put(op); + } next(); - g.put(op); } } @@ -1542,7 +1440,7 @@ struct Parser(R, Generator) /+ Postproces the IR, then optimize. 
+/ -@trusted void postprocess(Char)(ref Regex!Char zis) +@trusted void postprocess(Char)(ref Regex!Char zis) pure {//@@@BUG@@@ write is @system with(zis) { @@ -1604,7 +1502,15 @@ struct Parser(R, Generator) } checkIfOneShot(); if (!(flags & RegexInfo.oneShot)) - kickstart = Kickstart!Char(zis, new uint[](256)); + { + kickstart = new ShiftOr!Char(zis); + if (kickstart.empty) + { + kickstart = new BitMatcher!Char(zis); + if (kickstart.empty) + kickstart = null; + } + } debug(std_regex_allocation) writefln("IR processed, max threads: %d", threadCount); optimize(zis); } @@ -1654,7 +1560,7 @@ void fixupBytecode()(Bytecode[] ir) assert(fixups.empty); } -void optimize(Char)(ref Regex!Char zis) +void optimize(Char)(ref Regex!Char zis) pure { import std.array : insertInPlace; CodepointSet nextSet(uint idx) @@ -1671,7 +1577,7 @@ void optimize(Char)(ref Regex!Char zis) goto default; //TODO: OrChar case Trie, CodepointSet: - set = zis.charsets[ir[i].data]; + set = .CodepointSet(zis.charsets[ir[i].data]); goto default; case GroupStart,GroupEnd: break; diff --git a/std/regex/internal/kickstart.d b/std/regex/internal/shiftor.d similarity index 75% rename from std/regex/internal/kickstart.d rename to std/regex/internal/shiftor.d index f052a955509..48bfebfebe8 100644 --- a/std/regex/internal/kickstart.d +++ b/std/regex/internal/shiftor.d @@ -1,8 +1,8 @@ /* - Kickstart is a coarse-grained "filter" engine that finds likely matches - to be verified by full-blown matcher. + ShiftOr is a kickstart engine, a coarse-grained "filter" engine that finds + potential matches to be verified by a full-blown matcher. */ -module std.regex.internal.kickstart; +module std.regex.internal.shiftor; package(std.regex): @@ -26,9 +26,10 @@ uint effectiveSize(Char)() Kickstart engine using ShiftOr algorithm, a bit parallel technique for inexact string searching. 
*/ -struct ShiftOr(Char) +class ShiftOr(Char) : Kickstart!Char { private: +pure: uint[] table; uint fChar; uint n_length; @@ -115,8 +116,8 @@ private: { auto t = worklist[$-1]; worklist.length -= 1; - if (!__ctfe) - cast(void)worklist.assumeSafeAppend(); + //if (!__ctfe) + // cast(void)worklist.assumeSafeAppend(); return t; } @@ -127,13 +128,13 @@ private: } public: - @trusted this(ref Regex!Char re, uint[] memory) + @trusted this(ref Regex!Char re) { static import std.algorithm.comparison; import std.algorithm.searching : countUntil; import std.conv : text; import std.range : assumeSorted; - assert(memory.length == 256); + uint[] memory = new uint[256]; fChar = uint.max; // FNV-1a flavored hash (uses 32bits at a time) ulong hash(uint[] tab) @@ -241,9 +242,9 @@ public: static immutable codeBounds = [0x0, 0x7F, 0x80, 0x7FF, 0x800, 0xFFFF, 0x10000, 0x10FFFF]; else //== 2 static immutable codeBounds = [0x0, 0xFFFF, 0x10000, 0x10FFFF]; - uint[] arr = new uint[set.byInterval.length * 2]; + uint[] arr = new uint[set.length * 2]; size_t ofs = 0; - foreach (ival; set.byInterval) + foreach (ival; set) { arr[ofs++] = ival.a; arr[ofs++] = ival.b; @@ -262,7 +263,8 @@ public: auto chars = set.length; if (chars > charsetThreshold) goto L_StopThread; - foreach (ch; set.byCodepoint) + foreach (ival; set) + foreach (ch; ival.a..ival.b) { //avoid surrogate pairs if (0xD800 <= ch && ch <= 0xDFFF) @@ -339,25 +341,6 @@ public: t.pc += IRL!(IR.RepeatEnd); } break; - case IR.InfiniteStart, IR.InfiniteQStart: - t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteStart); - goto case IR.InfiniteEnd; //both Q and non-Q - case IR.InfiniteEnd: - case IR.InfiniteQEnd: - auto slot = re.ir[t.pc+1].raw+t.counter; - auto val = hash(t.tab); - if (val in merge[slot]) - goto L_StopThread; // merge equivalent - merge[slot][val] = true; - uint len = re.ir[t.pc].data; - uint pc1, pc2; //branches to take in priority order - if (++t.hops == 32) - goto L_StopThread; - pc1 = t.pc + IRL!(IR.InfiniteEnd); - pc2 = t.pc 
- len; - trs ~= fork(t, pc2, t.counter); - t.pc = pc1; - break; case IR.GroupStart, IR.GroupEnd: t.pc += IRL!(IR.GroupStart); break; @@ -370,7 +353,6 @@ public: default: L_StopThread: assert(re.ir[t.pc].code >= 0x80, text(re.ir[t.pc].code)); - debug (fred_search) writeln("ShiftOr stumbled on ",re.ir[t.pc].mnemonic); n_length = std.algorithm.comparison.min(t.idx, n_length); break L_Eval_Thread; } @@ -385,22 +367,23 @@ public: } } - @property bool empty() const { return n_length == 0; } + final @property bool empty() const { return n_length < 3 && fChar == uint.max; } - @property uint length() const{ return n_length/charSize; } + final @property uint length() const{ return n_length/charSize; } // lookup compatible bit pattern in haystack, return starting index // has a useful trait: if supplied with valid UTF indexes, // returns only valid UTF indexes // (that given the haystack in question is valid UTF string) - @trusted size_t search(const(Char)[] haystack, size_t idx) + final @trusted bool search(ref Input!Char s) const {//@BUG: apparently assumes little endian machines import std.conv : text; import core.stdc.string : memchr; assert(!empty); - auto p = cast(const(ubyte)*)(haystack.ptr+idx); + auto haystack = s._origin; uint state = uint.max; uint limit = 1u<<(n_length - 1u); + auto p = cast(const(ubyte)*)(haystack.ptr+s._index); debug(std_regex_search) writefln("Limit: %32b",limit); if (fChar != uint.max) { @@ -415,11 +398,17 @@ public: assert(p <= end, text(p," vs ", end)); p = cast(ubyte*)memchr(p, fChar, end - p); if (!p) - return haystack.length; + { + s._index = haystack.length; + return false; + } if ((cast(size_t)p & (Char.sizeof-1)) == orginalAlign) break; if (++p == end) - return haystack.length; + { + s._index = haystack.length; + return false; + } } state = ~1u; assert((cast(size_t)p & (Char.sizeof-1)) == orginalAlign); @@ -433,8 +422,10 @@ public: p++; //first char is tested, see if that's all if (!(state & limit)) - return 
(p-cast(ubyte*)haystack.ptr)/Char.sizeof - -length; + { + s._index = (p-cast(ubyte*)haystack.ptr)/Char.sizeof-length; + return true; + } } else {//have some bits/states for possible matches, @@ -452,8 +443,10 @@ public: p++; } if (!(state & limit)) - return (p-cast(ubyte*)haystack.ptr)/Char.sizeof - -length; + { + s._index = (p-cast(ubyte*)haystack.ptr)/Char.sizeof-length; + return true; + } } debug(std_regex_search) writefln("State: %32b", state); } @@ -471,8 +464,10 @@ public: state = (state<<1) | table[p[2]]; p += 4; if (!(state & limit))//division rounds down for dchar - return (p-cast(ubyte*)haystack.ptr)/Char.sizeof - -length; + { + s._index = (p-cast(ubyte*)haystack.ptr)/Char.sizeof-length; + return true; + } } } else @@ -483,23 +478,37 @@ public: { state = (state<<1) | table[p[i++]]; if (!(state & limit)) - return idx+i/Char.sizeof-length; + { + s._index += i/Char.sizeof-length; + return true; + } } while (i < len) { state = (state<<1) | table[p[i++]]; if (!(state & limit)) - return idx+i/Char.sizeof - -length; + { + s._index += i/Char.sizeof-length; + return true; + } state = (state<<1) | table[p[i++]]; if (!(state & limit)) - return idx+i/Char.sizeof - -length; + { + s._index += i/Char.sizeof-length; + return true; + } debug(std_regex_search) writefln("State: %32b", state); } } } - return haystack.length; + s._index = haystack.length; + return false; + } + + final @trusted bool match(ref Input!Char s) const + { + //TODO: stub + return false; } @system debug static void dump(uint[] table) @@ -507,7 +516,7 @@ public: import std.stdio : writefln; for (size_t i = 0; i < table.length; i += 4) { - writefln("%32b %32b %32b %32b",table[i], table[i+1], table[i+2], table[i+3]); + debug writefln("%32b %32b %32b %32b",table[i], table[i+1], table[i+2], table[i+3]); } } } @@ -515,65 +524,59 @@ public: unittest { import std.conv, std.regex; - @trusted void test_fixed(alias Kick)() + auto shiftOrLength(C)(const(C)[] pat, uint length) { - foreach (i, v; AliasSeq!(char, 
wchar, dchar)) + auto r = regex(pat, "s"); + auto kick = new ShiftOr!C(r); + assert(kick.length == length, text(C.stringof, " == ", kick.length)); + return kick; + } + auto searches(C)(const (C)[] source, ShiftOr!C kick, uint[] results...) + { + auto inp = Input!C(source); + foreach (r; results) { - alias Char = v; - alias String = immutable(v)[]; - auto r = regex(to!String(`abc$`)); - auto kick = Kick!Char(r, new uint[256]); - assert(kick.length == 3, text(Kick.stringof," ",v.stringof, " == ", kick.length)); - auto r2 = regex(to!String(`(abc){2}a+`)); - kick = Kick!Char(r2, new uint[256]); - assert(kick.length == 7, text(Kick.stringof,v.stringof," == ", kick.length)); - auto r3 = regex(to!String(`\b(a{2}b{3}){2,4}`)); - kick = Kick!Char(r3, new uint[256]); - assert(kick.length == 10, text(Kick.stringof,v.stringof," == ", kick.length)); - auto r4 = regex(to!String(`\ba{2}c\bxyz`)); - kick = Kick!Char(r4, new uint[256]); - assert(kick.length == 6, text(Kick.stringof,v.stringof, " == ", kick.length)); - auto r5 = regex(to!String(`\ba{2}c\b`)); - kick = Kick!Char(r5, new uint[256]); - size_t x = kick.search("aabaacaa", 0); - assert(x == 3, text(Kick.stringof,v.stringof," == ", kick.length)); - x = kick.search("aabaacaa", x+1); - assert(x == 8, text(Kick.stringof,v.stringof," == ", kick.length)); + kick.search(inp); + dchar ch; + size_t idx; + assert(inp._index == r, text(inp._index, " vs ", r)); + inp.nextChar(ch, idx); } } - @trusted void test_flex(alias Kick)() + + + foreach (i, Char; AliasSeq!(char, wchar, dchar)) { - foreach (i, v; AliasSeq!(char, wchar, dchar)) - { - alias Char = v; - alias String = immutable(v)[]; - auto r = regex(to!String(`abc[a-z]`)); - auto kick = Kick!Char(r, new uint[256]); - auto x = kick.search(to!String("abbabca"), 0); - assert(x == 3, text("real x is ", x, " ",v.stringof)); + alias String = immutable(Char)[]; + shiftOrLength(`abc`.to!String, 3); + shiftOrLength(`abc$`.to!String, 3); + shiftOrLength(`(abc){2}a+`.to!String, 7); + 
shiftOrLength(`\b(a{2}b{3}){2,4}`.to!String, 10); + shiftOrLength(`\ba{2}c\bxyz`.to!String, 6); + auto kick = shiftOrLength(`\ba{2}c\b`.to!String, 3); + auto inp = Input!Char("aabaacaa"); + assert(kick.search(inp)); + assert(inp._index == 3, text(Char.stringof," == ", kick.length)); + dchar ch; + size_t idx; + inp.nextChar(ch, idx); + assert(!kick.search(inp)); + assert(inp._index == 8, text(Char.stringof," == ", kick.length)); + } - auto r2 = regex(to!String(`(ax|bd|cdy)`)); - String s2 = to!String("abdcdyabax"); - kick = Kick!Char(r2, new uint[256]); - x = kick.search(s2, 0); - assert(x == 1, text("real x is ", x)); - x = kick.search(s2, x+1); - assert(x == 3, text("real x is ", x)); - x = kick.search(s2, x+1); - assert(x == 8, text("real x is ", x)); - auto rdot = regex(to!String(`...`)); - kick = Kick!Char(rdot, new uint[256]); - assert(kick.length == 0); - auto rN = regex(to!String(`a(b+|c+)x`)); - kick = Kick!Char(rN, new uint[256]); - assert(kick.length == 3, to!string(kick.length)); - assert(kick.search("ababx",0) == 2); - assert(kick.search("abaacba",0) == 3);//expected inexact + foreach (i, Char; AliasSeq!(char, wchar, dchar)) + { + alias String = immutable(Char)[]; + auto kick = shiftOrLength(`abc[a-z]`.to!String, 4); + searches("abbabca".to!String, kick, 3); + kick = shiftOrLength(`(axx|bdx|cdy)`.to!String, 3); + searches("abdcdxabax".to!String, kick, 3); - } + shiftOrLength(`...`.to!String, 0); + kick = shiftOrLength(`a(b{1,2}|c{1,2})x`.to!String, 3); + searches("ababx".to!String, kick, 2); + searches("abaacba".to!String, kick, 3); //expected inexact } - test_fixed!(ShiftOr)(); - test_flex!(ShiftOr)(); + } -alias Kickstart = ShiftOr; diff --git a/std/regex/internal/tests.d b/std/regex/internal/tests.d index a098fcc431c..4f52f819c5d 100644 --- a/std/regex/internal/tests.d +++ b/std/regex/internal/tests.d @@ -8,8 +8,6 @@ package(std.regex): import std.conv, std.exception, std.meta, std.range, std.typecons, std.regex; -import std.regex.internal.parser : 
Escapables; // characters that need escaping - alias Sequence(int B, int E) = staticIota!(B, E); unittest @@ -315,6 +313,7 @@ unittest TestVectors( `\b[A-Za-z0-9.]+(?=(@(?!gmail)))`, "a@gmail,x@com", "y", "$&-$1", "x-@"), TestVectors( `x()(abc)(?=(d)(e)(f)\2)`, "xabcdefabc", "y", "$&", "xabc"), TestVectors( `x()(abc)(?=(d)(e)(f)()\3\4\5)`, "xabcdefdef", "y", "$&", "xabc"), + //lookback TestVectors( `(?<=(ab))\d`, "12ba3ab4", "y", "$&-$1", "4-ab", "i"), TestVectors( `\w(?"); - assert(bmatch("texttext", greed).hit - == "text"); -} - -unittest -{ - import std.algorithm.comparison : equal; - auto cr8 = ctRegex!("^(a)(b)?(c*)"); - auto m8 = bmatch("abcc",cr8); - assert(m8); - assert(m8.captures[1] == "a"); - assert(m8.captures[2] == "b"); - assert(m8.captures[3] == "cc"); - auto cr9 = ctRegex!("q(a|b)*q"); - auto m9 = match("xxqababqyy",cr9); - assert(m9); - assert(equal(bmatch("xxqababqyy",cr9).captures, ["qababq", "b"])); -} - -unittest -{ - import std.algorithm.comparison : equal; - auto rtr = regex("a|b|c"); - enum ctr = regex("a|b|c"); - assert(equal(rtr.ir,ctr.ir)); - //CTFE parser BUG is triggered by group - //in the middle of alternation (at least not first and not last) - enum testCT = regex(`abc|(edf)|xyz`); - auto testRT = regex(`abc|(edf)|xyz`); - assert(equal(testCT.ir,testRT.ir)); -} - -unittest -{ - import std.algorithm.iteration : map; - import std.algorithm.comparison : equal; - enum cx = ctRegex!"(A|B|C)"; - auto mx = match("B",cx); - assert(mx); - assert(equal(mx.captures, [ "B", "B"])); - enum cx2 = ctRegex!"(A|B)*"; - assert(match("BAAA",cx2)); - - enum cx3 = ctRegex!("a{3,4}","i"); - auto mx3 = match("AaA",cx3); - assert(mx3); - assert(mx3.captures[0] == "AaA"); - enum cx4 = ctRegex!(`^a{3,4}?[a-zA-Z0-9~]{1,2}`,"i"); - auto mx4 = match("aaaabc", cx4); - assert(mx4); - assert(mx4.captures[0] == "aaaab"); - auto cr8 = ctRegex!("(a)(b)?(c*)"); - auto m8 = bmatch("abcc",cr8); - assert(m8); - assert(m8.captures[1] == "a"); - assert(m8.captures[2] == 
"b"); - assert(m8.captures[3] == "cc"); - auto cr9 = ctRegex!(".*$", "gm"); - auto m9 = match("First\rSecond", cr9); - assert(m9); - assert(equal(map!"a.hit"(m9), ["First", "", "Second"])); -} - -unittest -{ - import std.algorithm.iteration : map; - import std.algorithm.comparison : equal; -//global matching - void test_body(alias matchFn)() - { - string s = "a quick brown fox jumps over a lazy dog"; - auto r1 = regex("\\b[a-z]+\\b","g"); - string[] test; - foreach (m; matchFn(s, r1)) - test ~= m.hit; - assert(equal(test, [ "a", "quick", "brown", "fox", "jumps", "over", "a", "lazy", "dog"])); - auto free_reg = regex(` - - abc - \s+ - " - ( - [^"]+ - | \\ " - )+ - " - z - `, "x"); - auto m = match(`abc "quoted string with \" inside"z`,free_reg); - assert(m); - string mails = " hey@you.com no@spam.net "; - auto rm = regex(`@(?<=\S+@)\S+`,"g"); - assert(equal(map!"a[0]"(matchFn(mails, rm)), ["@you.com", "@spam.net"])); - auto m2 = matchFn("First line\nSecond line",regex(".*$","gm")); - assert(equal(map!"a[0]"(m2), ["First line", "", "Second line"])); - auto m2a = matchFn("First line\nSecond line",regex(".+$","gm")); - assert(equal(map!"a[0]"(m2a), ["First line", "Second line"])); - auto m2b = matchFn("First line\nSecond line",regex(".+?$","gm")); - assert(equal(map!"a[0]"(m2b), ["First line", "Second line"])); - debug(std_regex_test) writeln("!!! FReD FLAGS test done "~matchFn.stringof~" !!!"); - } - test_body!bmatch(); - test_body!match(); -} - -//tests for accumulated std.regex issues and other regressions -unittest -{ - import std.algorithm.iteration : map; - import std.algorithm.comparison : equal; - void test_body(alias matchFn)() - { - //issue 5857 - //matching goes out of control if ... 
in (...){x} has .*/.+ - auto c = matchFn("axxxzayyyyyzd",regex("(a.*z){2}d")).captures; - assert(c[0] == "axxxzayyyyyzd"); - assert(c[1] == "ayyyyyz"); - auto c2 = matchFn("axxxayyyyyd",regex("(a.*){2}d")).captures; - assert(c2[0] == "axxxayyyyyd"); - assert(c2[1] == "ayyyyy"); - //issue 2108 - //greedy vs non-greedy - auto nogreed = regex(""); - assert(matchFn("texttext", nogreed).hit - == "text"); - auto greed = regex(""); - assert(matchFn("texttext", greed).hit - == "texttext"); - //issue 4574 - //empty successful match still advances the input - string[] pres, posts, hits; - foreach (m; matchFn("abcabc", regex("","g"))) - { - pres ~= m.pre; - posts ~= m.post; - assert(m.hit.empty); - - } - auto heads = [ - "abcabc", - "abcab", - "abca", - "abc", - "ab", - "a", - "" - ]; - auto tails = [ - "abcabc", - "bcabc", - "cabc", - "abc", - "bc", - "c", - "" - ]; - assert(pres == array(retro(heads))); - assert(posts == tails); - //issue 6076 - //regression on .* - auto re = regex("c.*|d"); - auto m = matchFn("mm", re); - assert(!m); - debug(std_regex_test) writeln("!!! 
FReD REGRESSION test done "~matchFn.stringof~" !!!"); - auto rprealloc = regex(`((.){5}.{1,10}){5}`); - auto arr = array(repeat('0',100)); - auto m2 = matchFn(arr, rprealloc); - assert(m2); - assert(collectException( - regex(r"^(import|file|binary|config)\s+([^\(]+)\(?([^\)]*)\)?\s*$") - ) is null); - foreach (ch; [Escapables]) - { - assert(match(to!string(ch),regex(`[\`~ch~`]`))); - assert(!match(to!string(ch),regex(`[^\`~ch~`]`))); - assert(match(to!string(ch),regex(`[\`~ch~`-\`~ch~`]`))); - } - //bugzilla 7718 - string strcmd = "./myApp.rb -os OSX -path \"/GIT/Ruby Apps/sec\" -conf 'notimer'"; - auto reStrCmd = regex (`(".*")|('.*')`, "g"); - assert(equal(map!"a[0]"(matchFn(strcmd, reStrCmd)), - [`"/GIT/Ruby Apps/sec"`, `'notimer'`])); - } - test_body!bmatch(); - test_body!match(); -} - -// tests for replace -unittest -{ - void test(alias matchFn)() - { - import std.uni : toUpper; - - foreach (i, v; AliasSeq!(string, wstring, dstring)) - { - auto baz(Cap)(Cap m) - if (is(Cap == Captures!(Cap.String))) - { - return toUpper(m.hit); - } - alias String = v; - assert(std.regex.replace!(matchFn)(to!String("ark rapacity"), regex(to!String("r")), to!String("c")) - == to!String("ack rapacity")); - assert(std.regex.replace!(matchFn)(to!String("ark rapacity"), regex(to!String("r"), "g"), to!String("c")) - == to!String("ack capacity")); - assert(std.regex.replace!(matchFn)(to!String("noon"), regex(to!String("^n")), to!String("[$&]")) - == to!String("[n]oon")); - assert(std.regex.replace!(matchFn)( - to!String("test1 test2"), regex(to!String(`\w+`),"g"), to!String("$`:$'") - ) == to!String(": test2 test1 :")); - auto s = std.regex.replace!(baz!(Captures!(String)))(to!String("Strap a rocket engine on a chicken."), - regex(to!String("[ar]"), "g")); - assert(s == "StRAp A Rocket engine on A chicken."); - } - debug(std_regex_test) writeln("!!! 
Replace test done "~matchFn.stringof~" !!!"); - } - test!(bmatch)(); - test!(match)(); -} - -// tests for splitter -unittest -{ - import std.algorithm.comparison : equal; - auto s1 = ", abc, de, fg, hi, "; - auto sp1 = splitter(s1, regex(", *")); - auto w1 = ["", "abc", "de", "fg", "hi", ""]; - assert(equal(sp1, w1)); - - auto s2 = ", abc, de, fg, hi"; - auto sp2 = splitter(s2, regex(", *")); - auto w2 = ["", "abc", "de", "fg", "hi"]; - - uint cnt; - foreach (e; sp2) - { - assert(w2[cnt++] == e); - } - assert(equal(sp2, w2)); -} - -unittest -{ - char[] s1 = ", abc, de, fg, hi, ".dup; - auto sp2 = splitter(s1, regex(", *")); -} - -unittest -{ - import std.algorithm.comparison : equal; - auto s1 = ", abc, de, fg, hi, "; - auto w1 = ["", "abc", "de", "fg", "hi", ""]; - assert(equal(split(s1, regex(", *")), w1[])); -} - -unittest -{ // bugzilla 7141 - string pattern = `[a\--b]`; - assert(match("-", pattern)); - assert(match("b", pattern)); - string pattern2 = `[&-z]`; - assert(match("b", pattern2)); -} -unittest -{//bugzilla 7111 - assert(match("", regex("^"))); -} -unittest -{//bugzilla 7300 - assert(!match("a"d, "aa"d)); -} - -// bugzilla 7551 -unittest -{ - auto r = regex("[]abc]*"); - assert("]ab".matchFirst(r).hit == "]ab"); - assertThrown(regex("[]")); - auto r2 = regex("[]abc--ab]*"); - assert("]ac".matchFirst(r2).hit == "]"); -} - -unittest -{//bugzilla 7674 - assert("1234".replace(regex("^"), "$$") == "$1234"); - assert("hello?".replace(regex(r"\?", "g"), r"\?") == r"hello\?"); - assert("hello?".replace(regex(r"\?", "g"), r"\\?") != r"hello\?"); -} -unittest -{// bugzilla 7679 - import std.algorithm.comparison : equal; - foreach (S; AliasSeq!(string, wstring, dstring)) - (){ // avoid slow optimizations for large functions @@@BUG@@@ 2396 - enum re = ctRegex!(to!S(r"\.")); - auto str = to!S("a.b"); - assert(equal(std.regex.splitter(str, re), [to!S("a"), to!S("b")])); - assert(split(str, re) == [to!S("a"), to!S("b")]); - }(); -} -unittest -{//bugzilla 8203 - 
string data = " - NAME = XPAW01_STA:STATION - NAME = XPAW01_STA - "; - auto uniFileOld = data; - auto r = regex( - r"^NAME = (?P[a-zA-Z0-9_]+):*(?P[a-zA-Z0-9_]*)","gm"); - auto uniCapturesNew = match(uniFileOld, r); - for (int i = 0; i < 20; i++) - foreach (matchNew; uniCapturesNew) {} - //a second issue with same symptoms - auto r2 = regex(`([а-яА-Я\-_]+\s*)+(?<=[\s\.,\^])`); - match("аллея Театральная", r2); -} -unittest -{// bugzilla 8637 purity of enforce - auto m = match("hello world", regex("world")); - enforce(m); -} - -// bugzilla 8725 -unittest -{ - static italic = regex( r"\* - (?!\s+) - (.*?) - (?!\s+) - \*", "gx" ); - string input = "this * is* interesting, *very* interesting"; - assert(replace(input, italic, "$1") == - "this * is* interesting, very interesting"); -} - -// bugzilla 8349 -unittest -{ - enum peakRegexStr = r"\>(wgEncode.*Tfbs.*\.(?:narrow)|(?:broad)Peak.gz)"; - enum peakRegex = ctRegex!(peakRegexStr); - //note that the regex pattern itself is probably bogus - assert(match(r"\>wgEncode-blah-Tfbs.narrow", peakRegex)); -} - -// bugzilla 9211 -unittest -{ - import std.algorithm.comparison : equal; - auto rx_1 = regex(r"^(\w)*(\d)"); - auto m = match("1234", rx_1); - assert(equal(m.front, ["1234", "3", "4"])); - auto rx_2 = regex(r"^([0-9])*(\d)"); - auto m2 = match("1234", rx_2); - assert(equal(m2.front, ["1234", "3", "4"])); -} - -// bugzilla 9280 -unittest -{ - string tomatch = "a!b@c"; - static r = regex(r"^(?P.*?)!(?P.*?)@(?P.*?)$"); - auto nm = match(tomatch, r); - assert(nm); - auto c = nm.captures; - assert(c[1] == "a"); - assert(c["nick"] == "a"); -} - - -// bugzilla 9579 -unittest -{ - char[] input = ['a', 'b', 'c']; - string format = "($1)"; - // used to give a compile error: - auto re = regex(`(a)`, "g"); - auto r = replace(input, re, format); - assert(r == "(a)bc"); -} - -// bugzilla 9634 -unittest -{ - auto re = ctRegex!"(?:a+)"; - assert(match("aaaa", re).hit == "aaaa"); -} - -//bugzilla 10798 -unittest -{ - auto cr = 
ctRegex!("[abcd--c]*"); - auto m = "abc".match(cr); - assert(m); - assert(m.hit == "ab"); -} - -// bugzilla 10913 -unittest -{ - @system static string foo(const(char)[] s) - { - return s.dup; - } - @safe static string bar(const(char)[] s) - { - return s.dup; - } - () @system { - replace!((a) => foo(a.hit))("blah", regex(`a`)); - }(); - () @safe { - replace!((a) => bar(a.hit))("blah", regex(`a`)); - }(); -} - -// bugzilla 11262 -unittest -{ - enum reg = ctRegex!(r",", "g"); - auto str = "This,List"; - str = str.replace(reg, "-"); - assert(str == "This-List"); -} - -// bugzilla 11775 -unittest -{ - assert(collectException(regex("a{1,0}"))); -} - -// bugzilla 11839 -unittest -{ - import std.algorithm.comparison : equal; - assert(regex(`(?P\w+)`).namedCaptures.equal(["var1"])); - assert(collectException(regex(`(?P<1>\w+)`))); - assert(regex(`(?P\w+)`).namedCaptures.equal(["v1"])); - assert(regex(`(?P<__>\w+)`).namedCaptures.equal(["__"])); - assert(regex(`(?P<я>\w+)`).namedCaptures.equal(["я"])); -} - -// bugzilla 12076 -unittest -{ - auto RE = ctRegex!(r"(?abc)`); - assert(collectException("abc".matchFirst(r)["b"])); -} - -// bugzilla 12691 -unittest -{ - assert(bmatch("e@", "^([a-z]|)*$").empty); - assert(bmatch("e@", ctRegex!`^([a-z]|)*$`).empty); -} - -//bugzilla 12713 -unittest -{ - assertThrown(regex("[[a-z]([a-z]|(([[a-z])))")); -} - -//bugzilla 12747 -unittest -{ - assertThrown(regex(`^x(\1)`)); - assertThrown(regex(`^(x(\1))`)); - assertThrown(regex(`^((x)(?=\1))`)); -} - -// bugzilla 14504 -unittest -{ - auto p = ctRegex!("a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?" 
~ - "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); -} - -// bugzilla 14529 -unittest -{ - auto ctPat2 = regex(r"^[CDF]$", "i"); - foreach (v; ["C", "c", "D", "d", "F", "f"]) - assert(matchAll(v, ctPat2).front.hit == v); -} - -// bugzilla 14615 -unittest -{ - import std.stdio : writeln; - import std.regex : replaceFirst, replaceFirstInto, regex; - import std.array : appender; - - auto example = "Hello, world!"; - auto pattern = regex("^Hello, (bug)"); // won't find this one - auto result = replaceFirst(example, pattern, "$1 Sponge Bob"); - assert(result == "Hello, world!"); // Ok. - - auto sink = appender!string; - replaceFirstInto(sink, example, pattern, "$1 Sponge Bob"); - assert(sink.data == "Hello, world!"); - replaceAllInto(sink, example, pattern, "$1 Sponge Bob"); - assert(sink.data == "Hello, world!Hello, world!"); -} - -// bugzilla 15573 -unittest -{ - auto rx = regex("[c d]", "x"); - assert("a b".matchFirst(rx)); -} - -// bugzilla 15864 -unittest -{ - regex(`("); + assert(bmatch("texttext", greed).hit + == "text"); +} + +unittest +{ + auto cr8 = ctRegex!("^(a)(b)?(c*)"); + auto m8 = bmatch("abcc",cr8); + assert(m8); + assert(m8.captures[1] == "a"); + assert(m8.captures[2] == "b"); + assert(m8.captures[3] == "cc"); + auto cr9 = ctRegex!("q(a|b)*q"); + auto m9 = match("xxqababqyy",cr9); + assert(m9); + assert(equal(bmatch("xxqababqyy",cr9).captures, ["qababq", "b"])); +} + +unittest +{ + auto rtr = regex("a|b|c"); + const ctr = regex("a|b|c"); + assert(equal(rtr.ir,ctr.ir)); + //CTFE parser BUG is triggered by group + //in the middle of alternation (at least not first and not last) + const testCT = regex(`abc|(edf)|xyz`); + auto testRT = regex(`abc|(edf)|xyz`); + assert(equal(testCT.ir,testRT.ir)); +} + +unittest +{ + immutable cx = ctRegex!"(A|B|C)"; + auto mx = match("B",cx); + assert(mx); + assert(equal(mx.captures, [ "B", "B"])); + immutable cx2 = ctRegex!"(A|B)*"; + assert(match("BAAA",cx2)); + + immutable cx3 = ctRegex!("a{3,4}","i"); + auto mx3 = 
match("AaA",cx3); + assert(mx3); + assert(mx3.captures[0] == "AaA"); + immutable cx4 = ctRegex!(`^a{3,4}?[a-zA-Z0-9~]{1,2}`,"i"); + auto mx4 = match("aaaabc", cx4); + assert(mx4); + assert(mx4.captures[0] == "aaaab"); + auto cr8 = ctRegex!("(a)(b)?(c*)"); + auto m8 = bmatch("abcc",cr8); + assert(m8); + assert(m8.captures[1] == "a"); + assert(m8.captures[2] == "b"); + assert(m8.captures[3] == "cc"); + auto cr9 = ctRegex!(".*$", "gm"); + auto m9 = match("First\rSecond", cr9); + assert(m9); + assert(equal(map!"a.hit"(m9), ["First", "", "Second"])); +} + +unittest +{ +//global matching + void test_body(alias matchFn)() + { + string s = "a quick brown fox jumps over a lazy dog"; + auto r1 = regex("\\b[a-z]+\\b","g"); + string[] test; + foreach (m; matchFn(s, r1)) + test ~= m.hit; + assert(equal(test, [ "a", "quick", "brown", "fox", "jumps", "over", "a", "lazy", "dog"])); + auto free_reg = regex(` + + abc + \s+ + " + ( + [^"]+ + | \\ " + )+ + " + z + `, "x"); + auto m = match(`abc "quoted string with \" inside"z`,free_reg); + assert(m); + string mails = " hey@you.com no@spam.net "; + auto rm = regex(`@(?<=\S+@)\S+`,"g"); + assert(equal(map!"a[0]"(matchFn(mails, rm)), ["@you.com", "@spam.net"])); + auto m2 = matchFn("First line\nSecond line",regex(".*$","gm")); + assert(equal(map!"a[0]"(m2), ["First line", "", "Second line"])); + auto m2a = matchFn("First line\nSecond line",regex(".+$","gm")); + assert(equal(map!"a[0]"(m2a), ["First line", "Second line"])); + auto m2b = matchFn("First line\nSecond line",regex(".+?$","gm")); + assert(equal(map!"a[0]"(m2b), ["First line", "Second line"])); + debug(std_regex_test) writeln("!!! FReD FLAGS test done "~matchFn.stringof~" !!!"); + } + test_body!bmatch(); + test_body!match(); +} + +//tests for accumulated std.regex issues and other regressions +unittest +{ + void test_body(alias matchFn)() + { + //issue 5857 + //matching goes out of control if ... 
in (...){x} has .*/.+ + auto c = matchFn("axxxzayyyyyzd",regex("(a.*z){2}d")).captures; + assert(c[0] == "axxxzayyyyyzd"); + assert(c[1] == "ayyyyyz"); + auto c2 = matchFn("axxxayyyyyd",regex("(a.*){2}d")).captures; + assert(c2[0] == "axxxayyyyyd"); + assert(c2[1] == "ayyyyy"); + //issue 2108 + //greedy vs non-greedy + auto nogreed = regex(""); + assert(matchFn("texttext", nogreed).hit + == "text"); + auto greed = regex(""); + assert(matchFn("texttext", greed).hit + == "texttext"); + //issue 4574 + //empty successful match still advances the input + string[] pres, posts, hits; + foreach (m; matchFn("abcabc", regex("","g"))) { + pres ~= m.pre; + posts ~= m.post; + assert(m.hit.empty); + + } + auto heads = [ + "abcabc", + "abcab", + "abca", + "abc", + "ab", + "a", + "" + ]; + auto tails = [ + "abcabc", + "bcabc", + "cabc", + "abc", + "bc", + "c", + "" + ]; + assert(pres == array(retro(heads))); + assert(posts == tails); + //issue 6076 + //regression on .* + auto re = regex("c.*|d"); + auto m = matchFn("mm", re); + assert(!m); + debug(std_regex_test) writeln("!!! 
FReD REGRESSION test done "~matchFn.stringof~" !!!"); + auto rprealloc = regex(`((.){5}.{1,10}){5}`); + auto arr = array(repeat('0',100)); + auto m2 = matchFn(arr, rprealloc); + assert(m2); + assert(collectException( + regex(r"^(import|file|binary|config)\s+([^\(]+)\(?([^\)]*)\)?\s*$") + ) is null); + foreach (ch; [Escapables]) + { + assert(match(to!string(ch),regex(`[\`~ch~`]`))); + assert(!match(to!string(ch),regex(`[^\`~ch~`]`))); + assert(match(to!string(ch),regex(`[\`~ch~`-\`~ch~`]`))); + } + //bugzilla 7718 + string strcmd = "./myApp.rb -os OSX -path \"/GIT/Ruby Apps/sec\" -conf 'notimer'"; + auto reStrCmd = regex (`(".*")|('.*')`, "g"); + assert(equal(map!"a[0]"(matchFn(strcmd, reStrCmd)), + [`"/GIT/Ruby Apps/sec"`, `'notimer'`])); + } + test_body!bmatch(); + test_body!match(); +} + +// tests for replace +unittest +{ + void test(alias matchFn)() + { + import std.uni : toUpper; + + foreach (i, v; AliasSeq!(string, wstring, dstring)) + { + auto baz(Cap)(Cap m) + if (is(Cap == Captures!(Cap.String))) + { + return toUpper(m.hit); + } + alias String = v; + assert(std.regex.replace!(matchFn)(to!String("ark rapacity"), regex(to!String("r")), to!String("c")) + == to!String("ack rapacity")); + assert(std.regex.replace!(matchFn)(to!String("ark rapacity"), regex(to!String("r"), "g"), to!String("c")) + == to!String("ack capacity")); + assert(std.regex.replace!(matchFn)(to!String("noon"), regex(to!String("^n")), to!String("[$&]")) + == to!String("[n]oon")); + assert(std.regex.replace!(matchFn)(to!String("test1 test2"), regex(to!String(`\w+`),"g"), to!String("$`:$'")) + == to!String(": test2 test1 :")); + auto s = std.regex.replace!(baz!(Captures!(String)))(to!String("Strap a rocket engine on a chicken."), + regex(to!String("[ar]"), "g")); + assert(s == "StRAp A Rocket engine on A chicken.", text(s)); + } + debug(std_regex_test) writeln("!!! 
Replace test done "~matchFn.stringof~" !!!"); + } + test!(bmatch)(); + test!(match)(); +} + +// tests for splitter +unittest +{ + auto s1 = ", abc, de, fg, hi, "; + auto sp1 = splitter(s1, regex(", *")); + auto w1 = ["", "abc", "de", "fg", "hi", ""]; + assert(equal(sp1, w1)); + + auto s2 = ", abc, de, fg, hi"; + auto sp2 = splitter(s2, regex(", *")); + auto w2 = ["", "abc", "de", "fg", "hi"]; + + uint cnt; + foreach (e; sp2) { + assert(w2[cnt++] == e); + } + assert(equal(sp2, w2)); +} + +unittest +{ + char[] s1 = ", abc, de, fg, hi, ".dup; + auto sp2 = splitter(s1, regex(", *")); +} + +unittest +{ + auto s1 = ", abc, de, fg, hi, "; + auto w1 = ["", "abc", "de", "fg", "hi", ""]; + assert(equal(split(s1, regex(", *")), w1[])); +} diff --git a/std/regex/internal/tests3.d b/std/regex/internal/tests3.d new file mode 100644 index 00000000000..3bd8cb8f336 --- /dev/null +++ b/std/regex/internal/tests3.d @@ -0,0 +1,305 @@ +/* + Regualar expressions package test suite part 3. +*/ +module std.regex.internal.tests3; + +package(std.regex): + +import std.algorithm, std.conv, std.exception, std.meta, std.range, + std.typecons, std.regex; + +unittest +{ // bugzilla 7141 + string pattern = `[a\--b]`; + assert(match("-", pattern)); + assert(match("b", pattern)); + string pattern2 = `[&-z]`; + assert(match("b", pattern2)); +} +unittest +{//bugzilla 7111 + assert(match("", regex("^"))); +} +unittest +{//bugzilla 7300 + assert(!match("a"d, "aa"d)); +} + +// bugzilla 7551 +unittest +{ + auto r = regex("[]abc]*"); + assert("]ab".matchFirst(r).hit == "]ab"); + assertThrown(regex("[]")); + auto r2 = regex("[]abc--ab]*"); + assert("]ac".matchFirst(r2).hit == "]"); +} + +unittest +{//bugzilla 7674 + assert("1234".replace(regex("^"), "$$") == "$1234"); + assert("hello?".replace(regex(r"\?", "g"), r"\?") == r"hello\?"); + assert("hello?".replace(regex(r"\?", "g"), r"\\?") != r"hello\?"); +} +unittest +{// bugzilla 7679 + foreach (S; AliasSeq!(string, wstring, dstring)) + (){ // avoid slow 
optimizations for large functions @@@BUG@@@ 2396 + const re = ctRegex!(to!S(r"\.")); + auto str = to!S("a.b"); + assert(equal(std.regex.splitter(str, re), [to!S("a"), to!S("b")])); + assert(split(str, re) == [to!S("a"), to!S("b")]); + }(); +} +unittest +{//bugzilla 8203 + string data = " + NAME = XPAW01_STA:STATION + NAME = XPAW01_STA + "; + auto uniFileOld = data; + auto r = regex( + r"^NAME = (?P[a-zA-Z0-9_]+):*(?P[a-zA-Z0-9_]*)","gm"); + auto uniCapturesNew = match(uniFileOld, r); + for (int i = 0; i < 20; i++) + foreach (matchNew; uniCapturesNew) {} + //a second issue with same symptoms + auto r2 = regex(`([а-яА-Я\-_]+\s*)+(?<=[\s\.,\^])`); + match("аллея Театральная", r2); +} +unittest +{// bugzilla 8637 purity of enforce + auto m = match("hello world", regex("world")); + enforce(m); +} + +// bugzilla 8725 +unittest +{ + static italic = regex( r"\* + (?!\s+) + (.*?) + (?!\s+) + \*", "gx" ); + string input = "this * is* interesting, *very* interesting"; + assert(replace(input, italic, "$1") == + "this * is* interesting, very interesting"); +} + +// bugzilla 8349 +unittest +{ + const peakRegexStr = r"\>(wgEncode.*Tfbs.*\.(?:narrow)|(?:broad)Peak.gz)"; + const peakRegex = ctRegex!(peakRegexStr); + //note that the regex pattern itself is probably bogus + assert(match(r"\>wgEncode-blah-Tfbs.narrow", peakRegex)); +} + +// bugzilla 9211 +unittest +{ + auto rx_1 = regex(r"^(\w)*(\d)"); + auto m = match("1234", rx_1); + assert(equal(m.front, ["1234", "3", "4"])); + auto rx_2 = regex(r"^([0-9])*(\d)"); + auto m2 = match("1234", rx_2); + assert(equal(m2.front, ["1234", "3", "4"])); +} + +// bugzilla 9280 +unittest +{ + string tomatch = "a!b@c"; + static r = regex(r"^(?P.*?)!(?P.*?)@(?P.*?)$"); + auto nm = match(tomatch, r); + assert(nm); + auto c = nm.captures; + assert(c[1] == "a"); + assert(c["nick"] == "a"); +} + + +// bugzilla 9579 +unittest +{ + char[] input = ['a', 'b', 'c']; + string format = "($1)"; + // used to give a compile error: + auto re = regex(`(a)`, 
"g"); + auto r = replace(input, re, format); + assert(r == "(a)bc"); +} + +// bugzilla 9634 +unittest +{ + auto re = ctRegex!"(?:a+)"; + assert(match("aaaa", re).hit == "aaaa"); +} + +//bugzilla 10798 +unittest +{ + auto cr = ctRegex!("[abcd--c]*"); + auto m = "abc".match(cr); + assert(m); + assert(m.hit == "ab"); +} + +// bugzilla 10913 +unittest +{ + @system static string foo(const(char)[] s) + { + return s.dup; + } + @safe static string bar(const(char)[] s) + { + return s.dup; + } + () @system { + replace!((a) => foo(a.hit))("blah", regex(`a`)); + }(); + () @safe { + replace!((a) => bar(a.hit))("blah", regex(`a`)); + }(); +} + +// bugzilla 11262 +unittest +{ + const reg = ctRegex!(r",", "g"); + auto str = "This,List"; + str = str.replace(reg, "-"); + assert(str == "This-List"); +} + +// bugzilla 11775 +unittest +{ + assert(collectException(regex("a{1,0}"))); +} + +// bugzilla 11839 +unittest +{ + assert(regex(`(?P\w+)`).namedCaptures.equal(["var1"])); + assert(collectException(regex(`(?P<1>\w+)`))); + assert(regex(`(?P\w+)`).namedCaptures.equal(["v1"])); + assert(regex(`(?P<__>\w+)`).namedCaptures.equal(["__"])); + assert(regex(`(?P<я>\w+)`).namedCaptures.equal(["я"])); +} + +// bugzilla 12076 +unittest +{ + auto RE = ctRegex!(r"(?abc)`); + assert(collectException("abc".matchFirst(r)["b"])); +} + +// bugzilla 12691 +unittest +{ + assert(bmatch("e@", "^([a-z]|)*$").empty); + assert(bmatch("e@", ctRegex!`^([a-z]|)*$`).empty); +} + +//bugzilla 12713 +unittest +{ + assertThrown(regex("[[a-z]([a-z]|(([[a-z])))")); +} + +//bugzilla 12747 +unittest +{ + assertThrown(regex(`^x(\1)`)); + assertThrown(regex(`^(x(\1))`)); + assertThrown(regex(`^((x)(?=\1))`)); +} + +// bugzilla 14504 +unittest +{ + auto p = ctRegex!("a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?" 
~ + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); +} + +// bugzilla 14529 +unittest +{ + auto ctPat2 = regex(r"^[CDF]$", "i"); + foreach (v; ["C", "c", "D", "d", "F", "f"]) + assert(matchAll(v, ctPat2).front.hit == v); +} + +// bugzilla 14615 +unittest +{ + import std.stdio : writeln; + import std.regex : replaceFirst, replaceFirstInto, regex; + import std.array : appender; + + auto example = "Hello, world!"; + auto pattern = regex("^Hello, (bug)"); // won't find this one + auto result = replaceFirst(example, pattern, "$1 Sponge Bob"); + assert(result == "Hello, world!"); // Ok. + + auto sink = appender!string; + replaceFirstInto(sink, example, pattern, "$1 Sponge Bob"); + assert(sink.data == "Hello, world!"); + replaceAllInto(sink, example, pattern, "$1 Sponge Bob"); + assert(sink.data == "Hello, world!Hello, world!"); +} + +// bugzilla 15573 +unittest +{ + auto rx = regex("[c d]", "x"); + assert("a b".matchFirst(rx)); +} + +// bugzilla 15864 +unittest +{ + regex(`( counter + DataIndex genCounter; // merge trace counter, goes up on every dchar OpFunc[] opCacheTrue; // pointers to Op!(IR.xyz) for each bytecode OpFunc[] opCacheFalse; // ditto OpBackFunc[] opCacheBackTrue; // ditto OpBackFunc[] opCacheBackFalse; // ditto size_t threadSize; + size_t threadCount; int matched; bool exhausted; + const Kickstart!Char kickstart; + Group!DataIndex[] backrefed; + size_t[size_t] subCounters; // a table of gen counter per sub-engine: PC -> counter static struct State { @@ -799,7 +806,7 @@ template ThompsonOps(E,S, bool withInput:false) bool search() { - if (!s.search(re.kickstart, front, index)) + if (!s.search(kickstart, front, index)) { index = s.lastIndex; return false; @@ -808,24 +815,23 @@ template ThompsonOps(E,S, bool withInput:false) } } - void initExternalMemory(void[] memory) + void initExternalMemory(void[] memory, size_t hotspotTableSize) { - threadSize = getThreadSize(re); - prepareFreeList(re.threadCount, memory); - if (re.hotspotTableSize) + prepareFreeList(threadCount, 
memory); + if (hotspotTableSize) { - merge = arrayInChunk!(DataIndex)(re.hotspotTableSize, memory); + merge = arrayInChunk!(DataIndex)(hotspotTableSize, memory); merge[] = 0; } - opCacheTrue = arrayInChunk!(OpFunc)(re.ir.length, memory); - opCacheFalse = arrayInChunk!(OpFunc)(re.ir.length, memory); - opCacheBackTrue = arrayInChunk!(OpBackFunc)(re.ir.length, memory); - opCacheBackFalse = arrayInChunk!(OpBackFunc)(re.ir.length, memory); + opCacheTrue = arrayInChunk!(OpFunc)(ir.length, memory); + opCacheFalse = arrayInChunk!(OpFunc)(ir.length, memory); + opCacheBackTrue = arrayInChunk!(OpBackFunc)(ir.length, memory); + opCacheBackFalse = arrayInChunk!(OpBackFunc)(ir.length, memory); - for (uint pc = 0; pc 1) { auto app = appender!S(); @@ -344,10 +332,28 @@ public alias StaticRegex(Char) = std.regex.internal.ir.StaticRegex!(Char); } else pat = patterns[0]; + return regexImpl!S(pat, flags); +} +/++ + Compile regular expression pattern for the later execution. + Returns: $(D Regex) object that works on inputs having + the same character width as $(D pattern). + + Params: + pattern(s) = Regular expression(s) to match + flags = The _attributes (g, i, m and x accepted) + + Throws: $(D RegexException) if there were any errors during compilation. 
++/ +@trusted public auto regex(S)(S[] patterns, const(char)[] flags="") + if (isSomeString!(S)) +{ + import std.functional : memoize; + enum cacheSize = 8; if (__ctfe) - return regexImpl(pat, flags); - return memoize!(regexImpl!S, cacheSize)(pat, flags); + return regexPure(patterns, flags); + return memoize!(regexPure!S, cacheSize)(patterns, flags); } ///ditto @@ -371,7 +377,7 @@ unittest assert(m.front[1] == "12"); } -public auto regexImpl(S)(S pattern, const(char)[] flags="") +private auto regexImpl(S)(S pattern, const(char)[] flags="") pure if (isSomeString!(S)) { import std.regex.internal.parser : Parser, CodeGen; @@ -381,19 +387,25 @@ public auto regexImpl(S)(S pattern, const(char)[] flags="") } -template ctRegexImpl(alias pattern, string flags=[]) +private template IsolatedFunc(Char, alias source) { - import std.regex.internal.parser, std.regex.internal.backtracking; - enum r = regex(pattern, flags); - alias Char = BasicElementOf!(typeof(pattern)); - enum source = ctGenRegExCode(r); + import std.regex.internal.backtracking; alias Matcher = BacktrackingMatcher!(true); - @trusted bool func(ref Matcher!Char matcher) + @trusted bool IsolatedFunc(ref Matcher!Char matcher) { debug(std_regex_ctr) pragma(msg, source); mixin(source); } - enum nr = StaticRegex!Char(r, &func); +} + +template ctRegexImpl(alias pattern, string flags=[]) +{ + import std.regex.internal.parser, std.regex.internal.backtracking; + static immutable r = cast(immutable)regexPure([pattern], flags); + alias Char = BasicElementOf!(typeof(pattern)); + enum source = ctGenRegExCode(r); + alias func = IsolatedFunc!(Char, source); + static immutable nr = immutable StaticRegex!Char(r, &func); } /++ @@ -406,7 +418,7 @@ template ctRegexImpl(alias pattern, string flags=[]) pattern = Regular expression flags = The _attributes (g, i, m and x accepted) +/ -public enum ctRegex(alias pattern, alias flags=[]) = ctRegexImpl!(pattern, flags).nr; +public static immutable ctRegex(alias pattern, alias flags=[]) = 
ctRegexImpl!(pattern, flags).nr; enum isRegexFor(RegEx, R) = is(RegEx == Regex!(BasicElementOf!R)) || is(RegEx == StaticRegex!(BasicElementOf!R)); @@ -436,9 +448,9 @@ private: } uint _f, _b; uint _refcount; // ref count or SMALL MASK + num groups - NamedGroup[] _names; + const NamedGroup[] _names; - this()(R input, uint n, NamedGroup[] named) + this()(R input, uint n, const(NamedGroup)[] named) { _input = input; _names = named; @@ -447,16 +459,6 @@ private: _f = 0; } - this(alias Engine)(ref RegexMatch!(R,Engine) rmatch) - { - _input = rmatch._input; - _names = rmatch._engine.re.dict; - immutable n = rmatch._engine.re.ngroup; - newMatches(n); - _b = n; - _f = 0; - } - @property inout(Group!DataIndex[]) matches() inout { return (_refcount & SMALL_MASK) ? small_matches[0 .. _refcount & 0xFF] : big_matches; @@ -660,21 +662,24 @@ private: alias EngineType = Engine!Char; EngineType _engine; R _input; + uint _ngroup; Captures!(R,EngineType.DataIndex) _captures; void[] _memory;//is ref-counted - this(RegEx)(R input, RegEx prog) + this(RegEx)(R input, RegEx prog, uint reFlags) { import std.exception : enforce; _input = input; + _ngroup = prog.ngroup; immutable size = EngineType.initialMemory(prog)+size_t.sizeof; _memory = (enforce(malloc(size), "malloc failed")[0..size]); scope(failure) free(_memory.ptr); *cast(size_t*)_memory.ptr = 1; - _engine = EngineType(prog, Input!Char(input), _memory[size_t.sizeof..$]); - static if (is(RegEx == StaticRegex!(BasicElementOf!R))) + _engine = EngineType(prog, Input!Char(input), + _memory[size_t.sizeof..$], reFlags); + static if (is(typeof(prog.nativeFn))) _engine.nativeFn = prog.nativeFn; - _captures = Captures!(R,EngineType.DataIndex)(this); + _captures = Captures!(R,EngineType.DataIndex)(input, prog.ngroup, prog.dict); _captures._nMatch = _engine.match(_captures.matches); debug(std_regex_allocation) writefln("RefCount (ctor): %x %d", _memory.ptr, counter); } @@ -743,16 +748,16 @@ public: if (counter != 1) {//do cow magic first 
counter--;//we abandon this reference - immutable size = EngineType.initialMemory(_engine.re)+size_t.sizeof; + immutable size = _memory.length; _memory = (enforce(malloc(size), "malloc failed")[0..size]); - _engine = _engine.dupTo(_memory[size_t.sizeof..size]); + _engine.dupTo(_memory[size_t.sizeof..size]); counter = 1;//points to new chunk } if (!_captures.unique) { // has external references - allocate new space - _captures.newMatches(_engine.re.ngroup); + _captures.newMatches(_ngroup); } _captures._nMatch = _engine.match(_captures.matches); } @@ -771,7 +776,7 @@ public: } -private @trusted auto matchOnce(alias Engine, RegEx, R)(R input, RegEx re) +private @trusted auto matchOnce(alias Engine, RegEx, R)(R input, const RegEx re) { import core.stdc.stdlib : malloc, free; import std.exception : enforce; @@ -782,17 +787,16 @@ private @trusted auto matchOnce(alias Engine, RegEx, R)(R input, RegEx re) void[] memory = enforce(malloc(size), "malloc failed")[0..size]; scope(exit) free(memory.ptr); auto captures = Captures!(R, EngineType.DataIndex)(input, re.ngroup, re.dict); - auto engine = EngineType(re, Input!Char(input), memory); - static if (is(RegEx == StaticRegex!(BasicElementOf!R))) + auto engine = EngineType(re, Input!Char(input), memory, re.flags); + static if (is(typeof(re.nativeFn))) engine.nativeFn = re.nativeFn; captures._nMatch = engine.match(captures.matches); return captures; } -private auto matchMany(alias Engine, RegEx, R)(R input, RegEx re) +private auto matchMany(alias Engine, RegEx, R)(R input, const RegEx re) { - re.flags |= RegexOption.global; - return RegexMatch!(R, Engine)(input, re); + return RegexMatch!(R, Engine)(input, re, re.flags | RegexOption.global); } unittest @@ -847,7 +851,7 @@ private void replaceMatchesInto(alias output, Sink, R, T) } // a general skeleton of replaceFirst -private R replaceFirstWith(alias output, R, RegEx)(R input, RegEx re) +private R replaceFirstWith(alias output, R, RegEx)(R input, const RegEx re) if 
(isSomeString!R && isRegexFor!(RegEx, R)) { import std.array : appender; @@ -862,7 +866,7 @@ private R replaceFirstWith(alias output, R, RegEx)(R input, RegEx re) // ditto for replaceAll // the method parameter allows old API to ride on the back of the new one private R replaceAllWith(alias output, - alias method=matchAll, R, RegEx)(R input, RegEx re) + alias method=matchAll, R, RegEx)(R input, const RegEx re) if (isSomeString!R && isRegexFor!(RegEx, R)) { import std.array : appender; @@ -891,11 +895,12 @@ private R replaceAllWith(alias output, Returns: a $(D RegexMatch) object holding engine state after first match. +/ -public auto match(R, RegEx)(R input, RegEx re) +public auto match(R, RegEx)(R input, const RegEx re) if (isSomeString!R && is(RegEx == Regex!(BasicElementOf!R))) { import std.regex.internal.thompson : ThompsonMatcher; - return RegexMatch!(Unqual!(typeof(input)),ThompsonMatcher)(input, re); + return RegexMatch!(Unqual!(typeof(input)),ThompsonMatcher) + (input, re, re.flags); } ///ditto @@ -903,14 +908,17 @@ public auto match(R, String)(R input, String re) if (isSomeString!R && isSomeString!String) { import std.regex.internal.thompson : ThompsonMatcher; - return RegexMatch!(Unqual!(typeof(input)),ThompsonMatcher)(input, regex(re)); + auto r = regex(re); + return RegexMatch!(Unqual!(typeof(input)),ThompsonMatcher) + (input, r, r.flags); } -public auto match(R, RegEx)(R input, RegEx re) +public auto match(R, RegEx)(R input, const RegEx re) if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R))) { import std.regex.internal.backtracking : BacktrackingMatcher; - return RegexMatch!(Unqual!(typeof(input)),BacktrackingMatcher!true)(input, re); + return RegexMatch!(Unqual!(typeof(input)),BacktrackingMatcher!true) + (input, re, re.flags); } /++ @@ -931,7 +939,7 @@ public auto match(R, RegEx)(R input, RegEx re) $(LREF Captures) containing the extent of a match together with all submatches if there was a match, otherwise an empty $(LREF Captures) 
object. +/ -public auto matchFirst(R, RegEx)(R input, RegEx re) +public auto matchFirst(R, RegEx)(R input, const RegEx re) if (isSomeString!R && is(RegEx == Regex!(BasicElementOf!R))) { import std.regex.internal.thompson : ThompsonMatcher; @@ -954,7 +962,7 @@ public auto matchFirst(R, String)(R input, String[] re...) return matchOnce!ThompsonMatcher(input, regex(re)); } -public auto matchFirst(R, RegEx)(R input, RegEx re) +public auto matchFirst(R, RegEx)(R input, const RegEx re) if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R))) { import std.regex.internal.backtracking : BacktrackingMatcher; @@ -982,7 +990,7 @@ public auto matchFirst(R, RegEx)(R input, RegEx re) $(LREF RegexMatch) object that represents matcher state after the first match was found or an empty one if not present. +/ -public auto matchAll(R, RegEx)(R input, RegEx re) +public auto matchAll(R, RegEx)(R input, const RegEx re) if (isSomeString!R && is(RegEx == Regex!(BasicElementOf!R))) { import std.regex.internal.thompson : ThompsonMatcher; @@ -1005,7 +1013,7 @@ public auto matchAll(R, String)(R input, String[] re...) 
return matchMany!ThompsonMatcher(input, regex(re)); } -public auto matchAll(R, RegEx)(R input, RegEx re) +public auto matchAll(R, RegEx)(R input, const RegEx re) if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R))) { import std.regex.internal.backtracking : BacktrackingMatcher; @@ -1022,7 +1030,7 @@ public auto matchAll(R, RegEx)(R input, RegEx re) foreach (String; AliasSeq!(string, wstring, const(dchar)[])) { auto str1 = "blah-bleh".to!String(); - auto pat1 = "bl[ae]h".to!String(); + const pat1 = "bl[ae]h".to!String(); auto mf = matchFirst(str1, pat1); assert(mf.equal(["blah".to!String()])); auto mAll = matchAll(str1, pat1); @@ -1030,7 +1038,7 @@ public auto matchAll(R, RegEx)(R input, RegEx re) ([["blah".to!String()], ["bleh".to!String()]])); auto str2 = "1/03/12 - 3/03/12".to!String(); - auto pat2 = regex([r"(\d+)/(\d+)/(\d+)".to!String(), "abc".to!String]); + const pat2 = regex([r"(\d+)/(\d+)/(\d+)".to!String(), "abc".to!String]); auto mf2 = matchFirst(str2, pat2); assert(mf2.equal(["1/03/12", "1", "03", "12"].map!(to!String)())); auto mAll2 = matchAll(str2, pat2); @@ -1040,7 +1048,7 @@ public auto matchAll(R, RegEx)(R input, RegEx re) mf2.popFrontN(3); assert(mf2.equal(["12".to!String()])); - auto ctPat = ctRegex!(`(?P\d+)/(?P\d+)`.to!String()); + const ctPat = ctRegex!(`(?P\d+)/(?P\d+)`.to!String()); auto str = "2 + 34/56 - 6/1".to!String(); auto cmf = matchFirst(str, ctPat); assert(cmf.equal(["34/56", "34", "56"].map!(to!String)())); @@ -1071,11 +1079,12 @@ public auto matchAll(R, RegEx)(R input, RegEx re) state after first match. 
+/ -public auto bmatch(R, RegEx)(R input, RegEx re) +public auto bmatch(R, RegEx)(R input, const RegEx re) if (isSomeString!R && is(RegEx == Regex!(BasicElementOf!R))) { import std.regex.internal.backtracking : BacktrackingMatcher; - return RegexMatch!(Unqual!(typeof(input)), BacktrackingMatcher!false)(input, re); + return RegexMatch!(Unqual!(typeof(input)), BacktrackingMatcher!false) + (input, re, re.flags); } ///ditto @@ -1083,14 +1092,17 @@ public auto bmatch(R, String)(R input, String re) if (isSomeString!R && isSomeString!String) { import std.regex.internal.backtracking : BacktrackingMatcher; - return RegexMatch!(Unqual!(typeof(input)), BacktrackingMatcher!false)(input, regex(re)); + auto r = regex(re); + return RegexMatch!(Unqual!(typeof(input)), BacktrackingMatcher!false) + (input, r, r.flags); } -public auto bmatch(R, RegEx)(R input, RegEx re) +public auto bmatch(R, RegEx)(R input, const RegEx re) if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R))) { import std.regex.internal.backtracking : BacktrackingMatcher; - return RegexMatch!(Unqual!(typeof(input)),BacktrackingMatcher!true)(input, re); + return RegexMatch!(Unqual!(typeof(input)),BacktrackingMatcher!true) + (input, re, re.flags); } // produces replacement string from format using captures for substitution @@ -1183,7 +1195,7 @@ L_Replace_Loop: A string of the same type with the first match (if any) replaced. If no match is found returns the input string itself. +/ -public R replaceFirst(R, C, RegEx)(R input, RegEx re, const(C)[] format) +public R replaceFirst(R, C, RegEx)(R input, const RegEx re, const(C)[] format) if (isSomeString!R && is(C : dchar) && isRegexFor!(RegEx, R)) { return replaceFirstWith!((m, sink) => replaceFmt(format, m, sink))(input, re); @@ -1210,7 +1222,7 @@ unittest replaced by return values of $(D fun). If no matches found returns the $(D input) itself. 
+/ -public R replaceFirst(alias fun, R, RegEx)(R input, RegEx re) +public R replaceFirst(alias fun, R, RegEx)(R input, const RegEx re) if (isSomeString!R && isRegexFor!(RegEx, R)) { return replaceFirstWith!((m, sink) => sink.put(fun(m)))(input, re); @@ -1236,7 +1248,7 @@ unittest and the one with the user defined callback. +/ public @trusted void replaceFirstInto(Sink, R, C, RegEx) - (ref Sink sink, R input, RegEx re, const(C)[] format) + (ref Sink sink, R input, const RegEx re, const(C)[] format) if (isOutputRange!(Sink, dchar) && isSomeString!R && is(C : dchar) && isRegexFor!(RegEx, R)) { @@ -1246,7 +1258,7 @@ public @trusted void replaceFirstInto(Sink, R, C, RegEx) ///ditto public @trusted void replaceFirstInto(alias fun, Sink, R, RegEx) - (Sink sink, R input, RegEx re) + (Sink sink, R input, const RegEx re) if (isOutputRange!(Sink, dchar) && isSomeString!R && isRegexFor!(RegEx, R)) { replaceCapturesInto!fun(sink, input, matchFirst(input, re)); @@ -1265,24 +1277,6 @@ unittest assert(result.data == "first\nsecond\n"); } -//examples for replaceFirst -@system unittest -{ - import std.conv; - string list = "#21 out of 46"; - string newList = replaceFirst!(cap => to!string(to!int(cap.hit)+1)) - (list, regex(`[0-9]+`)); - assert(newList == "#22 out of 46"); - import std.array; - string m1 = "first message\n"; - string m2 = "second message\n"; - auto result = appender!string(); - replaceFirstInto(result, m1, regex(`([a-z]+) message`), "$1"); - //equivalent of the above with user-defined callback - replaceFirstInto!(cap=>cap[1])(result, m2, regex(`([a-z]+) message`)); - assert(result.data == "first\nsecond\n"); -} - /++ Construct a new string from $(D input) by replacing all of the fragments that match a pattern $(D re) with a string generated @@ -1301,7 +1295,7 @@ unittest of the matches (if any) replaced. If no match is found returns the input string itself. 
+/ -public @trusted R replaceAll(R, C, RegEx)(R input, RegEx re, const(C)[] format) +public @trusted R replaceAll(R, C, RegEx)(R input, const RegEx re, const(C)[] format) if (isSomeString!R && is(C : dchar) && isRegexFor!(RegEx, R)) { return replaceAllWith!((m, sink) => replaceFmt(format, m, sink))(input, re); @@ -1311,7 +1305,7 @@ public @trusted R replaceAll(R, C, RegEx)(R input, RegEx re, const(C)[] format) unittest { // insert comma as thousands delimiter - auto re = regex(r"(?<=\d)(?=(\d\d\d)+\b)","g"); + const re = regex(r"(?<=\d)(?=(\d\d\d)+\b)","g"); assert(replaceAll("12000 + 42100 = 54100", re, ",") == "12,000 + 42,100 = 54,100"); } @@ -1335,7 +1329,7 @@ unittest re = compiled regular expression fun = delegate to use +/ -public @trusted R replaceAll(alias fun, R, RegEx)(R input, RegEx re) +public @trusted R replaceAll(alias fun, R, RegEx)(R input, const RegEx re) if (isSomeString!R && isRegexFor!(RegEx, R)) { return replaceAllWith!((m, sink) => sink.put(fun(m)))(input, re); @@ -1364,7 +1358,7 @@ unittest the other one with a user defined functor. 
+/ public @trusted void replaceAllInto(Sink, R, C, RegEx) - (Sink sink, R input, RegEx re, const(C)[] format) + (Sink sink, R input, const RegEx re, const(C)[] format) if (isOutputRange!(Sink, dchar) && isSomeString!R && is(C : dchar) && isRegexFor!(RegEx, R)) { @@ -1374,7 +1368,7 @@ public @trusted void replaceAllInto(Sink, R, C, RegEx) ///ditto public @trusted void replaceAllInto(alias fun, Sink, R, RegEx) - (Sink sink, R input, RegEx re) + (Sink sink, R input, const RegEx re) if (isOutputRange!(Sink, dchar) && isSomeString!R && isRegexFor!(RegEx, R)) { replaceMatchesInto!fun(sink, input, matchAll(input, re)); @@ -1411,8 +1405,8 @@ public @trusted void replaceAllInto(alias fun, Sink, R, RegEx) S t2F = "hound dome".to!S(); S t1A = "court trial".to!S(); S t2A = "hound home".to!S(); - auto re1 = regex("curt".to!S()); - auto re2 = regex("[dr]o".to!S()); + const re1 = regex("curt".to!S()); + const re2 = regex("[dr]o".to!S()); assert(replaceFirst(s1, re1, "court") == t1F); assert(replaceFirst(s2, re2, "ho") == t2F); @@ -1446,14 +1440,14 @@ public @trusted void replaceAllInto(alias fun, Sink, R, RegEx) The use of this function is $(RED discouraged), please use $(LREF replaceAll) or $(LREF replaceFirst) explicitly. 
+/ -public R replace(alias scheme = match, R, C, RegEx)(R input, RegEx re, const(C)[] format) +public R replace(alias scheme = match, R, C, RegEx)(R input, const RegEx re, const(C)[] format) if (isSomeString!R && isRegexFor!(RegEx, R)) { return replaceAllWith!((m, sink) => replaceFmt(format, m, sink), match)(input, re); } ///ditto -public R replace(alias fun, R, RegEx)(R input, RegEx re) +public R replace(alias fun, R, RegEx)(R input, const RegEx re) if (isSomeString!R && isRegexFor!(RegEx, R)) { return replaceAllWith!(fun, match)(input, re); @@ -1475,15 +1469,14 @@ public struct Splitter(Flag!"keepSeparators" keepSeparators = No.keepSeparators, private: Range _input; size_t _offset; - alias Rx = typeof(match(Range.init,RegEx.init)); + alias Rx = typeof(matchAll(Range.init,RegEx.init)); Rx _match; static if (keepSeparators) bool onMatch = false; - @trusted this(Range input, RegEx separator) + @trusted this(Range input, const RegEx separator) {//@@@BUG@@@ generated opAssign of RegexMatch is not @trusted _input = input; - separator.flags |= RegexOption.global; if (_input.empty) { //there is nothing to match at all, make _offset > 0 @@ -1491,7 +1484,7 @@ private: } else { - _match = Rx(_input, separator); + _match = matchAll(_input, separator); static if (keepSeparators) if (_match.pre.empty) @@ -1579,8 +1572,9 @@ public: /// ditto public Splitter!(keepSeparators, Range, RegEx) splitter( - Flag!"keepSeparators" keepSeparators = No.keepSeparators, Range, RegEx)(Range r, RegEx pat) if ( - is(BasicElementOf!Range : dchar) && isRegexFor!(RegEx, Range)) + Flag!"keepSeparators" keepSeparators = No.keepSeparators, Range, RegEx) + (Range r, const RegEx pat) + if (is(BasicElementOf!Range : dchar) && isRegexFor!(RegEx, Range)) { return Splitter!(keepSeparators, Range, RegEx)(r, pat); } @@ -1599,7 +1593,7 @@ unittest { import std.algorithm.comparison : equal; - auto pattern = regex(`([\.,])`); + const pattern = regex(`([\.,])`); assert("2003.04.05" 
.splitter!(Yes.keepSeparators)(pattern) @@ -1611,7 +1605,7 @@ unittest } ///An eager version of $(D splitter) that creates an array with splitted slices of $(D input). -public @trusted String[] split(String, RegEx)(String input, RegEx rx) +public @trusted String[] split(String, RegEx)(String input, const RegEx rx) if (isSomeString!String && isRegexFor!(RegEx, String)) { import std.array : appender; diff --git a/std/uni.d b/std/uni.d index 8b7fa32fdcb..84c7ee31399 100644 --- a/std/uni.d +++ b/std/uni.d @@ -2120,19 +2120,6 @@ public: assert(!gothic['$']); } - // Linear scan for $(D ch). Useful only for small sets. - // TODO: - // used internally in std.regex - // should be properly exposed in a public API ? - package auto scanFor()(dchar ch) const - { - immutable len = data.length; - for (size_t i = 0; i < len; i++) - if (ch < data[i]) - return i & 1; - return 0; - } - /// Number of $(CODEPOINTS) in this set @property size_t length() { diff --git a/win32.mak b/win32.mak index 38cc8242fc8..1d4a6409795 100644 --- a/win32.mak +++ b/win32.mak @@ -220,12 +220,19 @@ SRC_STD_RANGE= \ SRC_STD_REGEX= \ std\regex\internal\ir.d \ std\regex\package.d \ - std\regex\internal\parser.d \ std\regex\internal\tests.d \ + std\regex\internal\generator.d + +SRC_STD_REGEX_2 = \ + std\regex\internal\parser.d \ std\regex\internal\backtracking.d \ std\regex\internal\thompson.d \ - std\regex\internal\kickstart.d \ - std\regex\internal\generator.d + std\regex\internal\tests2.d + +SRC_STD_REGEX_3 = \ + std\regex\internal\shiftor.d \ + std\regex\internal\bitnfa.d \ + std\regex\internal\tests3.d SRC_STD_C= \ std\c\process.d \ @@ -352,6 +359,8 @@ SRC_TO_COMPILE= \ $(SRC_STD_NET) \ $(SRC_STD_RANGE) \ $(SRC_STD_REGEX) \ + $(SRC_STD_REGEX_2) \ + $(SRC_STD_REGEX_3) \ $(SRC_STD_C) \ $(SRC_STD_WIN) \ $(SRC_STD_C_WIN) \ @@ -571,6 +580,8 @@ UNITTEST_OBJS= \ unittest8d.obj \ unittest8e.obj \ unittest8f.obj \ + unittest8g.obj \ + unittest8h.obj \ unittest9a.obj unittest : $(LIB) @@ -585,11 +596,13 @@ 
unittest : $(LIB) $(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest6.obj $(SRC_STD_6) $(SRC_STD_CONTAINER) $(SRC_STD_EXP_ALLOC) $(SRC_STD_EXP_LOGGER) $(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest7.obj $(SRC_STD_7) $(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest8a.obj $(SRC_STD_REGEX) - $(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest8b.obj $(SRC_STD_NET) - $(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest8c.obj $(SRC_STD_C) $(SRC_STD_WIN) $(SRC_STD_C_WIN) - $(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest8d.obj $(SRC_STD_INTERNAL) $(SRC_STD_INTERNAL_DIGEST) $(SRC_STD_INTERNAL_MATH) $(SRC_STD_INTERNAL_WINDOWS) - $(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest8e.obj $(SRC_ETC) $(SRC_ETC_C) - $(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest8f.obj $(SRC_STD_EXP) + $(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest8b.obj $(SRC_STD_REGEX_2) + $(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest8c.obj $(SRC_STD_REGEX_3) + $(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest8d.obj $(SRC_STD_NET) + $(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest8e.obj $(SRC_STD_C) $(SRC_STD_WIN) $(SRC_STD_C_WIN) + $(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest8f.obj $(SRC_STD_INTERNAL) $(SRC_STD_INTERNAL_DIGEST) $(SRC_STD_INTERNAL_MATH) $(SRC_STD_INTERNAL_WINDOWS) + $(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest8g.obj $(SRC_ETC) $(SRC_ETC_C) + $(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest8h.obj $(SRC_STD_EXP) $(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest9a.obj $(SRC_STD_EXP_NDSLICE) $(DMD) $(UDFLAGS) -L/co -unittest unittest.d $(UNITTEST_OBJS) \ $(ZLIB) $(DRUNTIMELIB) diff --git a/win64.mak b/win64.mak index 838595e31c2..9c2a2a487aa 100644 --- a/win64.mak +++ b/win64.mak @@ -239,12 +239,19 @@ SRC_STD_RANGE= \ SRC_STD_REGEX= \ std\regex\internal\ir.d \ std\regex\package.d \ - std\regex\internal\parser.d \ std\regex\internal\tests.d \ + std\regex\internal\generator.d + +SRC_STD_REGEX_2 = \ + std\regex\internal\parser.d \ std\regex\internal\backtracking.d \ std\regex\internal\thompson.d \ 
- std\regex\internal\kickstart.d \ - std\regex\internal\generator.d + std\regex\internal\tests2.d + +SRC_STD_REGEX_3 = \ + std\regex\internal\shiftor.d \ + std\regex\internal\bitnfa.d \ + std\regex\internal\tests3.d SRC_STD_C= \ std\c\process.d \ @@ -371,6 +378,8 @@ SRC_TO_COMPILE= \ $(SRC_STD_NET) \ $(SRC_STD_RANGE) \ $(SRC_STD_REGEX) \ + $(SRC_STD_REGEX_2) \ + $(SRC_STD_REGEX_3) \ $(SRC_STD_C) \ $(SRC_STD_WIN) \ $(SRC_STD_C_WIN) \ @@ -621,11 +630,13 @@ unittest : $(LIB) $(DMD) $(UDFLAGS) -c -unittest -ofunittest6i.obj $(SRC_STD_6i) $(DMD) $(UDFLAGS) -c -unittest -ofunittest7.obj $(SRC_STD_7) $(SRC_STD_EXP_LOGGER) $(DMD) $(UDFLAGS) -c -unittest -ofunittest8a.obj $(SRC_STD_REGEX) - $(DMD) $(UDFLAGS) -c -unittest -ofunittest8b.obj $(SRC_STD_NET) - $(DMD) $(UDFLAGS) -c -unittest -ofunittest8c.obj $(SRC_STD_C) $(SRC_STD_WIN) $(SRC_STD_C_WIN) - $(DMD) $(UDFLAGS) -c -unittest -ofunittest8d.obj $(SRC_STD_INTERNAL) $(SRC_STD_INTERNAL_DIGEST) $(SRC_STD_INTERNAL_MATH) $(SRC_STD_INTERNAL_WINDOWS) - $(DMD) $(UDFLAGS) -c -unittest -ofunittest8e.obj $(SRC_ETC) $(SRC_ETC_C) - $(DMD) $(UDFLAGS) -c -unittest -ofunittest8f.obj $(SRC_STD_EXP) + $(DMD) $(UDFLAGS) -c -unittest -ofunittest8b.obj $(SRC_STD_REGEX_2) + $(DMD) $(UDFLAGS) -c -unittest -ofunittest8c.obj $(SRC_STD_REGEX_3) + $(DMD) $(UDFLAGS) -c -unittest -ofunittest8d.obj $(SRC_STD_NET) + $(DMD) $(UDFLAGS) -c -unittest -ofunittest8e.obj $(SRC_STD_C) $(SRC_STD_WIN) $(SRC_STD_C_WIN) + $(DMD) $(UDFLAGS) -c -unittest -ofunittest8f.obj $(SRC_STD_INTERNAL) $(SRC_STD_INTERNAL_DIGEST) $(SRC_STD_INTERNAL_MATH) $(SRC_STD_INTERNAL_WINDOWS) + $(DMD) $(UDFLAGS) -c -unittest -ofunittest8g.obj $(SRC_ETC) $(SRC_ETC_C) + $(DMD) $(UDFLAGS) -c -unittest -ofunittest8h.obj $(SRC_STD_EXP) $(DMD) $(UDFLAGS) -c -unittest -ofunittest9.obj $(SRC_STD_EXP_ALLOC) $(DMD) $(UDFLAGS) -c -unittest -ofunittest9a.obj $(SRC_STD_EXP_NDSLICE) $(DMD) $(UDFLAGS) -L/OPT:NOICF -unittest unittest.d $(UNITTEST_OBJS) \