Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Alternations optimization #141

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
nimcache/
src/regex.js
src/regex.out
src/regex/altopt
src/regex/litopt
src/regex/nfatype
tests/tests
Expand Down
1 change: 1 addition & 0 deletions regex.nimble
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ requires "unicodedb >= 0.7.2"
task test2, "Test":
exec "nim c -r -o:bin/regex src/regex.nim"
exec "nim c -r -o:bin/litopt src/regex/litopt.nim"
exec "nim c -r -o:bin/altopt src/regex/altopt.nim"
exec "nim c -r -o:bin/nfatype src/regex/nfatype.nim"
exec "nim c -r tests/tests2.nim"
exec "nim c -r -d:forceRegexAtRuntime tests/tests2.nim"
Expand Down
224 changes: 224 additions & 0 deletions src/regex/altopt.nim
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@

# xxx rename to oropt

import std/tables
import std/unicode

import ./types

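# Outline:
# - find an OR
# - get its literal, then repeat for the next OR
# - a literal ends at EOE or at a group close
# - extract the literal that is common to every term
# - reconstruct the regex so the ORs apply to the remaining substrings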

func goesToOr(eNfa: Enfa, s: int16): bool =
  ## Tell whether at least one successor of state `s` is an
  ## `reOr` node.
  for succ in eNfa.s[s].next:
    if eNfa.s[succ].kind == reOr:
      result = true
      break

func nextOr(eNfa: Enfa, s: int16): int16 =
  ## Return the first successor of state `s` that is an `reOr`
  ## node. The assertion fires when `s` has no such successor.
  for i in 0 ..< eNfa.s[s].next.len:
    let succ = eNfa.s[s].next[i]
    if eNfa.s[succ].kind == reOr:
      return succ
  doAssert false

func lastOf(eNfa: Enfa, s: int16): (int16, int16) =
  ## Walk forward from state `s`, always taking the first
  ## transition, and stop at `reEoe` or at an `reGroupEnd` that
  ## does not close an `reGroupStart` seen during the walk.
  ## Return `(prev, stop)`: the state visited right before the
  ## stop state, and the stop state itself.
  ## `s` must not itself be `reEoe` or `reGroupEnd`.
  doAssert eNfa.s[s].kind notin {reEoe, reGroupEnd}
  var
    depth = 0  # group starts seen that are not closed yet
    cur = s
    prev = s
  while true:
    doAssert eNfa.s[cur].next.len <= 2
    case eNfa.s[cur].kind
    of reEoe:
      return (prev, cur)
    of reGroupEnd:
      if depth == 0:
        return (prev, cur)
      dec depth
    of reGroupStart:
      inc depth
    else:
      discard
    prev = cur
    # xxx fix greedy: with two transitions the first one is
    # followed as well
    cur = eNfa.s[cur].next[0]

func endOfOr(eNfa: Enfa, s: int16): int16 =
  ## For the `reOr` state `s`, return the state at which the
  ## `lastOf` walk stops (the second component of its result).
  doAssert eNfa.s[s].kind == reOr
  result = eNfa.lastOf(s)[1]

func term(eNfa: Enfa, s: int16): int16 =
  ## First successor of the `reOr` state `s`.
  doAssert eNfa.s[s].kind == reOr
  result = eNfa.s[s].next[0]

func term2(eNfa: Enfa, s: int16): int16 =
  ## Second successor of the `reOr` state `s`, unless that
  ## successor is itself an `reOr`; in that case the first
  ## successor is returned instead.
  doAssert eNfa.s[s].kind == reOr
  let second = eNfa.s[s].next[1]
  result =
    if eNfa.s[second].kind == reOr: eNfa.s[s].next[0]
    else: second

func eatOneState(eNfa: var Enfa, s: int16) =
  ## Make the first transition of the `reOr` state `s` bypass its
  ## current target. The target must be an `reChar` state with
  ## exactly one successor; `s` is re-pointed to that successor.
  doAssert eNfa.s[s].kind == reOr
  let skipped = eNfa.s[s].next[0]
  doAssert eNfa.s[skipped].kind == reChar
  doAssert eNfa.s[skipped].next.len == 1
  let target = eNfa.s[skipped].next[0]
  eNfa.s[s].next[0] = target

func eatOneState2(eNfa: var Enfa, s: int16) =
  ## Like `eatOneState`, but for the second transition of the
  ## `reOr` state `s`. When the second successor is itself an
  ## `reOr`, the first transition is processed instead.
  doAssert eNfa.s[s].kind == reOr
  let skipped = eNfa.s[s].next[1]
  if eNfa.s[skipped].kind != reOr:
    doAssert eNfa.s[skipped].kind == reChar
    doAssert eNfa.s[skipped].next.len == 1
    eNfa.s[s].next[1] = eNfa.s[skipped].next[0]
  else:
    eatOneState(eNfa, s)

func altOpt2(
eNfa: var Enfa,
state: int16,
repStates: var Table[int16, int16]
) =
## In-place alternation pass over `eNfa`, starting at `state`.
## When `state` is an `reOr`, consecutive terms that start with the
## same `reChar` code point are rewritten so the char is matched once,
## followed by a non-capturing group. Entries keyed by the first
## rewritten OR state are stored in `repStates`.
## NOTE(review): indentation is not visible in this view; confirm the
## nesting of the statements below against the original file.
var s = state
if eNfa.s[s].kind == reOr:
# collect `state` plus every OR reached by repeatedly taking `nextOr`
var ors = @[s]
while eNfa.goesToOr(s):
s = eNfa.nextOr(s)
ors.add s
# the last OR is appended once more; presumably this keeps
# `ors[i2+1]` below in range -- TODO confirm
ors.add s
#debugEcho ors
# continue from the state where the `lastOf` walk from the last OR stops
s = eNfa.endOfOr(s)
#debugEcho repr(eNfa.s[s])
doAssert eNfa.s[s].kind != reOr
var i = 0
while i < ors.len-1:
# `t` is the first successor of the i-th OR; only `reChar` is handled
var t = enfa.term(ors[i])
if enfa.s[t].kind != reChar:
inc i
continue
#debugEcho repr(enfa.s[t])
# advance `i2` while the next term is an `reChar` with the same `cp`
var i2 = i
let cp = enfa.s[t].cp
while i2 < ors.len-1:
var t2 = enfa.term2(ors[i2+1])
if enfa.s[t2].kind != reChar: break
if enfa.s[t2].cp != cp: break
inc i2
#debugEcho i2
# at least one following term shares `cp`
if i2 > i:
#debugEcho "OR transformation"
# append a char node for `cp` and a non-capturing group start;
# the char leads to the group start, which leads to `ors[i]`
enfa.s.add toCharNode(cp)
let cpIdx = (enfa.s.len-1).int16
enfa.s.add initGroupStart(isCapturing = false)
let gsIdx = (enfa.s.len-1).int16
repStates[ors[i]] = cpIdx
enfa.s[gsIdx].next.add ors[i]
enfa.s[cpIdx].next.add gsIdx
# append a group end that leads to `tend`; transitions of `tlast`
# that pointed to `tend` are redirected to the new group end
enfa.s.add Node(kind: reGroupEnd, cp: ')'.ord.Rune)
let geIdx = (enfa.s.len-1).int16
let (tlast, tend) = eNfa.lastOf enfa.term2(ors[i2])
#debugEcho repr(enfa.s[tlast])
#debugEcho repr(enfa.s[tend])
enfa.s[geIdx].next.add tend
for ii in 0 .. enfa.s[tlast].next.len-1:
if enfa.s[tlast].next[ii] == tend:
enfa.s[tlast].next[ii] = geIdx
# bypass the leading `reChar` of the ORs from `i` to `i2`
for i3 in i .. i2-1:
enfa.eatOneState(ors[i3])
enfa.eatOneState2(ors[i2])
# NOTE(review): rewiring for the case where `ors[i2]` still leads to
# another OR; exact intent and nesting are unclear from this view
if eNfa.goesToOr(ors[i2]):
enfa.s[ors[i2-1]].next[1] = enfa.s[ors[i2]].next[0]
enfa.s[ors[i2]].next[0] = cpIdx
repStates[ors[i]] = ors[i2]
# move past the terms examined in this round
i = max(i2+1, i+1)
#break
doAssert eNfa.s[s].kind != reOr
# recurse into what follows `s`; for repetition kinds only the first
# transition is followed
if eNfa.s[s].kind in repetitionKind:
doAssert eNfa.s[s].next.len == 2
altOpt2(eNfa, eNfa.s[s].next[0], repStates) # xxx fix greedy
else:
doAssert eNfa.s[s].next.len <= 1
for ss in eNfa.s[s].next:
altOpt2(eNfa, ss, repStates)

proc altOpt2(eNfa: Enfa): Enfa =
  ## Return a copy of `eNfa` processed by the in-place `altOpt2`
  ## pass, started at the last state (which must be `reSkip`).
  ## Afterwards every transition of the states `0 .. start` that
  ## points to a key of the replacement table is redirected to the
  ## mapped state, and the state at `start` is appended again at
  ## the end of the state list.
  result = eNfa
  let start = int16(eNfa.s.len-1)
  var repStates = initTable[int16, int16]()
  altOpt2(result, start, repStates)
  doAssert eNfa.s[start].kind == reSkip
  for s in 0 .. start:
    for target in result.s[s].next.mitems:
      if target in repStates:
        target = repStates[target]
  result.s.add result.s[start]

when isMainModule:
  import ./parser
  import ./exptransformation
  import ./nfa

  func altopt(s: string): Nfa =
    ## Build the NFA for `s`, running `altOpt2` on the epsilon-NFA
    ## before epsilon removal.
    let flags: RegexFlags = {}
    var groups: GroupsCapture
    let rpn = s.parse(flags).transformExp(groups, flags)
    result = rpn.subExps.eNfa.altOpt2.eRemoval

  func toNfa(s: string): Nfa =
    ## Build the NFA for `s` through `nfa2`, used as the reference
    ## in the assertions below.
    let flags: RegexFlags = {}
    var groups: GroupsCapture
    result = s.parse(flags).transformExp(groups, flags).nfa2

  proc toString(
    nfa: Nfa,
    nIdx: int16,
    visited: var set[int16]
  ): string =
    ## Render the states reachable from `nIdx` as nested brackets.
    ## `reEoe` prints as "eoe" and an already visited state as
    ## "[...]"; epsilon transitions are skipped.
    # XXX zero-match transitions are missing
    if nfa.s[nIdx].kind == reEoe:
      return "eoe"
    if nIdx in visited:
      return "[...]"
    visited.incl nIdx
    result = "["
    result.add $(nfa.s[nIdx].cp)
    for nn in nfa.s[nIdx].next:
      if not isEpsilonTransition(nfa.s[nn]):
        result.add ", "
        result.add toString(nfa, nn, visited)
    result.add "]"

  proc toString(nfa: Nfa): string {.used.} =
    ## Render the whole NFA starting from state 0.
    var seen: set[int16]
    result = toString(nfa, 0, seen)

  # the optimized NFA must print like the hand-factored regex
  doAssert r"abc".altopt.toString == r"abc".toNfa.toString
  doAssert r"a|b".altopt.toString == r"a|b".toNfa.toString
  doAssert r"ab|ab|ab".altopt.toString == r"a(b|b|b)".toNfa.toString
  doAssert r"ab|ab".altopt.toString == r"a(b|b)".toNfa.toString
  doAssert r"ab|ab|bc|bc".altopt.toString == r"a(b|b)|b(c|c)".toNfa.toString
  doAssert r"ab|ab|bc|bc".altopt.toString != r"a(b|b|b(c|c))".toNfa.toString
  echo "ok altopt.nim"
9 changes: 6 additions & 3 deletions src/regex/common.nim
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,16 @@ const
# Nim and not the actual `\n`
lineBreakRune* = Rune(10)

proc toRune*(s: string): Rune =
func toRune*(s: string): Rune {.inline.} =
result = s.runeAt(0)

proc `<=`*(x, y: Rune): bool =
func toRune*(c: char): Rune {.inline.} =
result = c.ord.Rune

func `<=`*(x, y: Rune): bool {.inline.} =
x.int <= y.int

proc cmp*(x, y: Rune): int =
func cmp*(x, y: Rune): int =
x.int - y.int

func bwRuneAt*(s: string, n: int): Rune =
Expand Down
20 changes: 12 additions & 8 deletions src/regex/types.nim
Original file line number Diff line number Diff line change
Expand Up @@ -136,22 +136,23 @@ func initJoinerNode*(): Node =
## Joiners are temporary nodes,
## they serve to generate the NFA
## but they are never part of it
Node(kind: reJoiner, cp: "~".toRune)
Node(kind: reJoiner, cp: '~'.toRune)

func initEoeNode*(): Node =
## return the end-of-expression ``Node``.
## This is a dummy node that marks a match as successful
Node(kind: reEoe, cp: "#".toRune)
Node(kind: reEoe, cp: '#'.toRune)

template initSetNodeImpl(result: var Node, k: NodeKind) =
## base node
assert k in {reInSet, reNotSet}
result = Node(
kind: k,
cp: "#".toRune,
cp: '#'.toRune,
cps: initHashSet[Rune](2),
ranges: @[],
shorthands: @[])
shorthands: @[]
)

func initSetNode*(): Node =
## return a set ``Node``,
Expand All @@ -171,30 +172,33 @@ func initGroupStart*(
## return a ``reGroupStart`` node
Node(
kind: reGroupStart,
cp: "(".toRune,
cp: '('.toRune,
name: name,
flags: flags,
isCapturing: isCapturing)

func initSkipNode*(): Node =
result = Node(
kind: reSkip,
cp: "#".toRune)
cp: '#'.toRune)

func initSkipNode*(next: openArray[int16]): Node =
## Return a dummy node that should be skipped
## while traversing the NFA
result = Node(
kind: reSkip,
cp: "#".toRune, next: toSeq(next))
cp: '#'.toRune,
next: toSeq(next)
)

func isEmpty*(n: Node): bool =
## check if a set ``Node`` is empty
assert n.kind in {reInSet, reNotSet}
result = (
n.cps.len == 0 and
n.ranges.len == 0 and
n.shorthands.len == 0)
n.shorthands.len == 0
)

const
opKind* = {
Expand Down
Loading