From 01c2744643d3b298566f9ea1e8086193193e4942 Mon Sep 17 00:00:00 2001
From: nitely
Date: Thu, 11 Apr 2024 16:24:17 -0300
Subject: [PATCH] alternations optimization

---
 .gitignore           |   1 +
 regex.nimble         |   1 +
 src/regex/altopt.nim | 224 +++++++++++++++++++++++++++++++++++++++++++
 src/regex/common.nim |   9 +-
 src/regex/types.nim  |  20 ++--
 5 files changed, 244 insertions(+), 11 deletions(-)
 create mode 100644 src/regex/altopt.nim

diff --git a/.gitignore b/.gitignore
index 847caa35..7ed5ec05 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 nimcache/
 src/regex.js
 src/regex.out
+src/regex/altopt
 src/regex/litopt
 src/regex/nfatype
 tests/tests
diff --git a/regex.nimble b/regex.nimble
index bdd84fd9..a8f65512 100644
--- a/regex.nimble
+++ b/regex.nimble
@@ -13,6 +13,7 @@ requires "unicodedb >= 0.7.2"
 task test2, "Test":
   exec "nim c -r -o:bin/regex src/regex.nim"
   exec "nim c -r -o:bin/litopt src/regex/litopt.nim"
+  exec "nim c -r -o:bin/altopt src/regex/altopt.nim"
   exec "nim c -r -o:bin/nfatype src/regex/nfatype.nim"
   exec "nim c -r tests/tests2.nim"
   exec "nim c -r -d:forceRegexAtRuntime tests/tests2.nim"
diff --git a/src/regex/altopt.nim b/src/regex/altopt.nim
new file mode 100644
index 00000000..ba65e2b8
--- /dev/null
+++ b/src/regex/altopt.nim
@@ -0,0 +1,224 @@
+
+# xxx rename to oropt
+
+import std/tables
+import std/unicode
+
+import ./types
+
+# find OR
+# get lit - repeat
+# end of lit is EOE or group close
+# extract common lit for every term
+# reconstruct regex with ORs to substr
+
+func goesToOr(eNfa: Enfa, s: int16): bool =  # is any successor of s an reOr?
+  result = false
+  for ss in eNfa.s[s].next:
+    if eNfa.s[ss].kind == reOr:
+      return true
+
+func nextOr(eNfa: Enfa, s: int16): int16 =  # first reOr successor of s
+  for ss in eNfa.s[s].next:
+    if eNfa.s[ss].kind == reOr:
+      return ss
+  doAssert false  # no reOr successor; altOpt2 guards the call with goesToOr
+
+func lastOf(eNfa: Enfa, s: int16): (int16, int16) =  # (state before end, end)
+  doAssert eNfa.s[s].kind notin {reEoe, reGroupEnd}
+  var gSymCount = 0  # groups opened so far during this walk
+  var s = s  # mutable cursor shadowing the parameter
+  var ps = s  # state visited just before s
+  while true:
+    doAssert eNfa.s[s].next.len <= 2
+    if eNfa.s[s].kind == reEoe:
+      return (ps, s)
+    if eNfa.s[s].kind == reGroupEnd:
+      if gSymCount == 0:  # closes a group that was not opened in this walk
+        return (ps, s)
+      dec gSymCount
+    if eNfa.s[s].kind == reGroupStart:
+      inc gSymCount
+    ps = s
+    if eNfa.s[s].next.len == 2: # xxx fix greedy (both branches take next[0])
+      s = eNfa.s[s].next[0]
+    else:
+      s = eNfa.s[s].next[0]
+
+func endOfOr(eNfa: Enfa, s: int16): int16 =  # end state lastOf reaches from s
+  doAssert eNfa.s[s].kind == reOr
+  let (_, tend) = eNfa.lastOf(s)
+  return tend
+
+func term(eNfa: Enfa, s: int16): int16 =  # first branch (next[0]) of the OR
+  doAssert eNfa.s[s].kind == reOr
+  return eNfa.s[s].next[0]
+
+func term2(eNfa: Enfa, s: int16): int16 =
+  ## return next[1] unless it is an OR; in that case return next[0]
+  doAssert eNfa.s[s].kind == reOr
+  let ss = eNfa.s[s].next[1]
+  if eNfa.s[ss].kind != reOr:
+    return ss
+  return eNfa.s[s].next[0]
+
+func eatOneState(eNfa: var Enfa, s: int16) =  # OR's next[0] skips its reChar
+  doAssert eNfa.s[s].kind == reOr
+  let ss = eNfa.s[s].next[0]
+  doAssert eNfa.s[ss].kind == reChar
+  doAssert eNfa.s[ss].next.len == 1
+  eNfa.s[s].next[0] = eNfa.s[ss].next[0]
+
+func eatOneState2(eNfa: var Enfa, s: int16) =  # same, but for next[1]
+  doAssert eNfa.s[s].kind == reOr
+  let ss = eNfa.s[s].next[1]
+  if eNfa.s[ss].kind == reOr:  # next[1] is an OR: eat from next[0] instead
+    eatOneState(eNfa, s)
+    return
+  doAssert eNfa.s[ss].kind == reChar
+  doAssert eNfa.s[ss].next.len == 1
+  eNfa.s[s].next[1] = eNfa.s[ss].next[0]
+
+func altOpt2(  # hoist a leading char shared by consecutive OR branches
+  eNfa: var Enfa,
+  state: int16,
+  repStates: var Table[int16, int16]  # state -> replacement target
+) =
+  var s = state
+  if eNfa.s[s].kind == reOr:
+    var ors = @[s]  # chain of OR states reachable through nextOr
+    while eNfa.goesToOr(s):
+      s = eNfa.nextOr(s)
+      ors.add s
+    ors.add s  # last OR appears twice; the loops below stop at ors.len-1
+    #debugEcho ors
+    s = eNfa.endOfOr(s)  # continue from the end of the alternation
+    #debugEcho repr(eNfa.s[s])
+    doAssert eNfa.s[s].kind != reOr
+    var i = 0
+    while i < ors.len-1:
+      var t = enfa.term(ors[i])  # first state of the branch under ors[i]
+      if enfa.s[t].kind != reChar:
+        inc i
+        continue
+      #debugEcho repr(enfa.s[t])
+      var i2 = i  # advanced while the following terms start with cp
+      let cp = enfa.s[t].cp
+      while i2 < ors.len-1:
+        var t2 = enfa.term2(ors[i2+1])
+        if enfa.s[t2].kind != reChar: break
+        if enfa.s[t2].cp != cp: break
+        inc i2
+      #debugEcho i2
+      if i2 > i:  # at least one following term starts with the same char
+        #debugEcho "OR transformation"
+        enfa.s.add toCharNode(cp)  # new state for the shared leading char
+        let cpIdx = (enfa.s.len-1).int16
+        enfa.s.add initGroupStart(isCapturing = false)
+        let gsIdx = (enfa.s.len-1).int16
+        repStates[ors[i]] = cpIdx  # wrapper redirects edges to ors[i] here
+        enfa.s[gsIdx].next.add ors[i]  # group start -> first OR of the run
+        enfa.s[cpIdx].next.add gsIdx  # shared char -> group start
+        enfa.s.add Node(kind: reGroupEnd, cp: ')'.ord.Rune)
+        let geIdx = (enfa.s.len-1).int16
+        let (tlast, tend) = eNfa.lastOf enfa.term2(ors[i2])
+        #debugEcho repr(enfa.s[tlast])
+        #debugEcho repr(enfa.s[tend])
+        enfa.s[geIdx].next.add tend  # group end -> old end state
+        for ii in 0 .. enfa.s[tlast].next.len-1:  # tlast now exits via geIdx
+          if enfa.s[tlast].next[ii] == tend:
+            enfa.s[tlast].next[ii] = geIdx
+        for i3 in i .. i2-1:  # strip the leading char from the run's terms
+          enfa.eatOneState(ors[i3])
+        enfa.eatOneState2(ors[i2])
+        if eNfa.goesToOr(ors[i2]):  # further ORs follow the merged run
+          enfa.s[ors[i2-1]].next[1] = enfa.s[ors[i2]].next[0]
+          enfa.s[ors[i2]].next[0] = cpIdx
+          repStates[ors[i]] = ors[i2]  # overrides the cpIdx mapping set above
+      i = max(i2+1, i+1)  # move past the run that was just examined
+      #break
+  doAssert eNfa.s[s].kind != reOr
+  if eNfa.s[s].kind in repetitionKind:
+    doAssert eNfa.s[s].next.len == 2
+    altOpt2(eNfa, eNfa.s[s].next[0], repStates) # xxx fix greedy
+  else:
+    doAssert eNfa.s[s].next.len <= 1
+    for ss in eNfa.s[s].next:  # recurse into the single successor, if any
+      altOpt2(eNfa, ss, repStates)
+
+proc altOpt2(eNfa: Enfa): Enfa =  # entry point: returns the rewritten eNfa
+  #debugEcho "altOpt2"
+  result = eNfa
+  let start = int16(eNfa.s.len-1)  # the walk starts at the last state
+  var repStates = initTable[int16, int16]()
+  altOpt2(result, start, repStates)
+  doAssert eNfa.s[start].kind == reSkip
+  for s in 0 .. start:  # only states that existed before the rewrite
+    for i in 0 .. result.s[s].next.len-1:
+      let ss = result.s[s].next[i]
+      if ss in repStates:
+        result.s[s].next[i] = repStates[ss]
+  result.s.add result.s[start]  # NOTE(review): presumably keeps start last
+  #debugEcho repr(result.s[result.s.len-1])
+  #debugEcho repr(result.s[10])
+  #debugEcho repr(result.s[11])
+
+when isMainModule:
+  import ./parser
+  import ./exptransformation
+  import ./nfa
+
+  func altopt(s: string): Nfa =  # parse s and build its NFA through altOpt2
+    let flags: RegexFlags = {}
+    var groups: GroupsCapture
+    let rpn = s
+      .parse(flags)
+      .transformExp(groups, flags)
+    result = rpn
+      .subExps
+      .eNfa
+      .altOpt2
+      .eRemoval
+
+  func toNfa(s: string): Nfa =  # parse s and build its NFA through nfa2
+    let flags: RegexFlags = {}
+    var groups: GroupsCapture
+    result = s
+      .parse(flags)
+      .transformExp(groups, flags)
+      .nfa2
+
+  proc toString(  # render the states reachable from nIdx as nested brackets
+    nfa: Nfa,
+    nIdx: int16,
+    visited: var set[int16]
+  ): string =
+    # XXX zero-match transitions are missing
+    if nfa.s[nIdx].kind == reEoe:
+      result = "eoe"
+      return
+    if nIdx in visited:  # already rendered once: emit a placeholder
+      result = "[...]"
+      return
+    visited.incl nIdx
+    let n = nfa.s[nIdx]
+    result = "["
+    result.add $n.cp
+    for nn in n.next:
+      if isEpsilonTransition(nfa.s[nn]):  # skipped, successors included
+        continue
+      result.add ", "
+      result.add toString(nfa, nn, visited)
+    result.add "]"
+
+  proc toString(nfa: Nfa): string {.used.} =  # render starting at state 0
+    var visited: set[int16]
+    result = toString(nfa, 0, visited)
+
+  doAssert r"abc".altopt.toString == r"abc".toNfa.toString
+  doAssert r"a|b".altopt.toString == r"a|b".toNfa.toString
+  doAssert r"ab|ab|ab".altopt.toString == r"a(b|b|b)".toNfa.toString
+  doAssert r"ab|ab".altopt.toString == r"a(b|b)".toNfa.toString
+  doAssert r"ab|ab|bc|bc".altopt.toString == r"a(b|b)|b(c|c)".toNfa.toString
+  doAssert r"ab|ab|bc|bc".altopt.toString != r"a(b|b|b(c|c))".toNfa.toString
+  echo "ok altopt.nim"
diff --git a/src/regex/common.nim b/src/regex/common.nim
index add006ec..5135b0e1 100644
--- a/src/regex/common.nim
+++ b/src/regex/common.nim
@@ -26,13 +26,16 @@ const
   # Nim and not the actual `\n`
   lineBreakRune* = Rune(10)
 
-proc toRune*(s: string): Rune =
+func toRune*(s: string): Rune {.inline.} =
   result = s.runeAt(0)
 
-proc `<=`*(x, y: Rune): bool =
+func toRune*(c: char): Rune {.inline.} =
+  result = c.ord.Rune
+
+func `<=`*(x, y: Rune): bool {.inline.} =
   x.int <= y.int
 
-proc cmp*(x, y: Rune): int =
+func cmp*(x, y: Rune): int =
   x.int - y.int
 
 func bwRuneAt*(s: string, n: int): Rune =
diff --git a/src/regex/types.nim b/src/regex/types.nim
index e4805ce4..a2243ce9 100644
--- a/src/regex/types.nim
+++ b/src/regex/types.nim
@@ -136,22 +136,23 @@ func initJoinerNode*(): Node =
   ## Joiners are temporary nodes,
   ## they serve to generate the NFA
   ## but they are never part of it
-  Node(kind: reJoiner, cp: "~".toRune)
+  Node(kind: reJoiner, cp: '~'.toRune)
 
 func initEoeNode*(): Node =
   ## return the end-of-expression ``Node``.
   ## This is a dummy node that marks a match as successful
-  Node(kind: reEoe, cp: "#".toRune)
+  Node(kind: reEoe, cp: '#'.toRune)
 
 template initSetNodeImpl(result: var Node, k: NodeKind) =
   ## base node
   assert k in {reInSet, reNotSet}
   result = Node(
     kind: k,
-    cp: "#".toRune,
+    cp: '#'.toRune,
     cps: initHashSet[Rune](2),
     ranges: @[],
-    shorthands: @[])
+    shorthands: @[]
+  )
 
 func initSetNode*(): Node =
   ## return a set ``Node``,
@@ -171,7 +172,7 @@ func initGroupStart*(
   ## return a ``reGroupStart`` node
   Node(
     kind: reGroupStart,
-    cp: "(".toRune,
+    cp: '('.toRune,
     name: name,
     flags: flags,
     isCapturing: isCapturing)
@@ -179,14 +180,16 @@ func initGroupStart*(
 func initSkipNode*(): Node =
   result = Node(
     kind: reSkip,
-    cp: "#".toRune)
+    cp: '#'.toRune)
 
 func initSkipNode*(next: openArray[int16]): Node =
   ## Return a dummy node that should be skipped
   ## while traversing the NFA
   result = Node(
     kind: reSkip,
-    cp: "#".toRune, next: toSeq(next))
+    cp: '#'.toRune,
+    next: toSeq(next)
+  )
 
 func isEmpty*(n: Node): bool =
   ## check if a set ``Node`` is empty
@@ -194,7 +197,8 @@ func isEmpty*(n: Node): bool =
   result = (
     n.cps.len == 0 and
     n.ranges.len == 0 and
-    n.shorthands.len == 0)
+    n.shorthands.len == 0
+  )
 
 const
   opKind* = {