Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use seq for node cps #149

Merged
merged 2 commits into from
Dec 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ src/regex.js
src/regex.out
src/regex/litopt
src/regex/nfatype
src/regex/common
tests/tests
tests/tests.js
tests/tests2
Expand Down
1 change: 1 addition & 0 deletions regex.nimble
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ task test2, "Test":
execTest "c", "-r -o:bin/regex src/regex.nim"
execTest "c", "-r -o:bin/litopt src/regex/litopt.nim"
execTest "c", "-r -o:bin/nfatype src/regex/nfatype.nim"
execTest "c", "-r -o:bin/common src/regex/common.nim"
execTest "c", "-r tests/tests2.nim"
execTest "c", "-r -d:forceRegexAtRuntime tests/tests2.nim"
execTest "c", "-r -d:forceRegexAtRuntime -d:noRegexOpt tests/tests2.nim"
Expand Down
72 changes: 70 additions & 2 deletions src/regex/common.nim
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import std/unicode
import std/strutils
import std/algorithm

type
RegexError* = object of ValueError
Expand All @@ -23,10 +24,10 @@ func toRune*(c: char): Rune =
result = Rune(c.ord)

func `<=`*(x, y: Rune): bool =
x.int <= y.int
x.int32 <= y.int32

func cmp*(x, y: Rune): int =
x.int - y.int
x.int32 - y.int32

func bwRuneAt*(s: string, n: int): Rune =
## Take rune ending at ``n``
Expand Down Expand Up @@ -106,3 +107,70 @@ func verifyUtf8*(s: string): int =
inc i
if state == vusStart:
result = -1

type
SortedSeq*[T] = object
s: seq[T]

func initSortedSeq*[T]: SortedSeq[T] {.inline.} =
SortedSeq[T](s: @[])

func len*[T](s: SortedSeq[T]): int {.inline.} =
s.s.len

func add*[T](s: var SortedSeq[T], x: openArray[T]) =
if x.len == 0:
return
s.s.add x
sort s.s, cmp

func contains*[T](s: SortedSeq[T], x: T): bool =
if s.len <= 10:
return x in s.s
return binarySearch(s.s, x, cmp) != -1

iterator items*[T](s: SortedSeq[T]): T {.inline.} =
for i in 0 .. s.s.len-1:
yield s.s[i]


when isMainModule:
block:
var s = initSortedSeq[int]()
doAssert s.s.len == 0
s.add @[2,1,3]
doAssert s.s == @[1,2,3]
s.add @[5,4,6,7]
doAssert s.s == @[1,2,3,4,5,6,7]
block:
var s = initSortedSeq[int]()
doAssert s.len == 0
s.add @[2,1,3]
doAssert s.len == 3
block:
var s = initSortedSeq[int]()
doAssert 1 notin s
s.add @[2,1,3]
doAssert 1 in s
doAssert 2 in s
doAssert 3 in s
doAssert 4 notin s
doAssert 0 notin s
block:
var s = initSortedSeq[int]()
s.add @[2,1,3]
var ss = newSeq[int]()
for x in s:
ss.add x
doAssert ss == @[1,2,3]
block:
var nums = newSeq[int]()
for x in 100 .. 200:
nums.add x
for x in 0 .. 100:
nums.add x
var s = initSortedSeq[int]()
s.add nums
for x in 0 .. 200:
doAssert x in s
echo "ok"
12 changes: 6 additions & 6 deletions src/regex/exptransformation.nim
Original file line number Diff line number Diff line change
Expand Up @@ -183,12 +183,12 @@ func applyFlag(n: var Node, f: Flag) =
# todo: apply recursevely to
# shorthands of reInSet/reNotSet (i.e: [:ascii:])
if n.kind in {reInSet, reNotSet}:
var cps = initHashSet[Rune](2)
cps.incl(n.cps)
for cp in cps:
let cpsc = cp.swapCase()
if cp != cpsc:
n.cps.incl(cpsc)
var cps = newSeq[Rune]()
for cp in items n.cps:
let cp2 = cp.swapCase()
if cp != cp2:
cps.add cp2
n.cps.add cps
for sl in n.ranges[0 .. ^1]:
let
cpa = sl.a.swapCase()
Expand Down
1 change: 0 additions & 1 deletion src/regex/nodematch.nim
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import std/unicode except `==`
import std/sets

import pkg/unicodedb/properties
import pkg/unicodedb/types as utypes
Expand Down
64 changes: 37 additions & 27 deletions src/regex/parser.nim
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import std/unicode
import std/strutils
import std/sets
import std/parseutils
import std/sequtils

import pkg/unicodedb/properties

Expand Down Expand Up @@ -291,69 +292,79 @@ func parseAsciiSet(sc: Scanner[Rune]): Node =
break
name.add(r.toUTF8)
prettyCheck(
sc.peek == ']'.toRune,
"Invalid ascii set. Expected [:name:]")
sc.peek == ']'.toRune, "Invalid ascii set. Expected [:name:]"
)
discard sc.next
case name
of "alpha":
result.ranges.add([
'a'.toRune .. 'z'.toRune,
'A'.toRune .. 'Z'.toRune])
'A'.toRune .. 'Z'.toRune
])
of "alnum":
result.ranges.add([
'0'.toRune .. '9'.toRune,
'a'.toRune .. 'z'.toRune,
'A'.toRune .. 'Z'.toRune])
'A'.toRune .. 'Z'.toRune
])
of "ascii":
result.ranges.add(
'\x00'.toRune .. '\x7F'.toRune)
'\x00'.toRune .. '\x7F'.toRune
)
of "blank":
result.cps.incl(toHashSet([
'\t'.toRune, ' '.toRune]))
result.cps.add(['\t'.toRune, ' '.toRune])
of "cntrl":
result.ranges.add(
'\x00'.toRune .. '\x1F'.toRune)
result.cps.incl('\x7F'.toRune)
'\x00'.toRune .. '\x1F'.toRune
)
result.cps.add(['\x7F'.toRune])
of "digit":
result.ranges.add(
'0'.toRune .. '9'.toRune)
'0'.toRune .. '9'.toRune
)
of "graph":
result.ranges.add(
'!'.toRune .. '~'.toRune)
'!'.toRune .. '~'.toRune
)
of "lower":
result.ranges.add(
'a'.toRune .. 'z'.toRune)
'a'.toRune .. 'z'.toRune
)
of "print":
result.ranges.add(
' '.toRune .. '~'.toRune)
' '.toRune .. '~'.toRune
)
of "punct":
result.ranges.add([
'!'.toRune .. '/'.toRune,
':'.toRune .. '@'.toRune,
'['.toRune .. '`'.toRune,
'{'.toRune .. '~'.toRune])
'{'.toRune .. '~'.toRune
])
of "space":
result.cps.incl(toHashSet([
result.cps.add([
'\t'.toRune, '\L'.toRune, '\v'.toRune,
'\f'.toRune, '\r'.toRune, ' '.toRune]))
'\f'.toRune, '\r'.toRune, ' '.toRune
])
of "upper":
result.ranges.add(
'A'.toRune .. 'Z'.toRune)
result.ranges.add('A'.toRune .. 'Z'.toRune)
of "word":
result.ranges.add([
'0'.toRune .. '9'.toRune,
'a'.toRune .. 'z'.toRune,
'A'.toRune .. 'Z'.toRune])
result.cps.incl('_'.toRune)
'A'.toRune .. 'Z'.toRune
])
result.cps.add(['_'.toRune])
of "xdigit":
result.ranges.add([
'0'.toRune .. '9'.toRune,
'a'.toRune .. 'f'.toRune,
'A'.toRune .. 'F'.toRune])
'A'.toRune .. 'F'.toRune
])
else:
prettyCheck(
false,
"Invalid ascii set. `$#` is not a valid name" %% name)
false, "Invalid ascii set. `$#` is not a valid name" %% name
)

func parseSet(sc: Scanner[Rune]): Node =
## parse a set atom (i.e ``[a-z]``) into a
Expand Down Expand Up @@ -430,11 +441,10 @@ func parseSet(sc: Scanner[Rune]): Node =
cps.add(cp)
else:
cps.add(cp)
# todo: use ref and set to nil when empty
result.cps.incl(cps.toHashSet)
result.cps.add toSeq(cps.toHashSet)
prettyCheck(
hasEnd,
"Invalid set. Missing `]`")
hasEnd, "Invalid set. Missing `]`"
)

func noRepeatCheck(sc: Scanner[Rune]) =
## Check next symbol is not a repetition
Expand Down
18 changes: 6 additions & 12 deletions src/regex/types.nim
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@
{.used.}

import std/unicode
import std/sets
from std/algorithm import sorted
from std/sequtils import toSeq

import pkg/unicodedb/properties
Expand Down Expand Up @@ -112,7 +110,7 @@ type
# reRepRange
min*, max*: int16
# reInSet, reNotSet
cps*: HashSet[Rune]
cps*: SortedSeq[Rune]
ranges*: seq[Slice[Rune]] # todo: interval tree
shorthands*: seq[Node]
# reUCC, reNotUCC
Expand Down Expand Up @@ -148,9 +146,10 @@ template initSetNodeImpl(result: var Node, k: NodeKind) =
result = Node(
kind: k,
cp: '#'.toRune,
cps: initHashSet[Rune](2),
cps: initSortedSeq[Rune](),
ranges: @[],
shorthands: @[])
shorthands: @[]
)

func initSetNode*(): Node =
## return a set ``Node``,
Expand Down Expand Up @@ -193,7 +192,8 @@ func isEmpty*(n: Node): bool =
result = (
n.cps.len == 0 and
n.ranges.len == 0 and
n.shorthands.len == 0)
n.shorthands.len == 0
)

const
opKind* = {
Expand Down Expand Up @@ -317,13 +317,7 @@ func `$`*(n: Node): string =
str.add '['
if n.kind == reNotSet:
str.add '^'
var
cps = newSeq[Rune](n.cps.len)
i = 0
for cp in n.cps:
cps[i] = cp
inc i
for cp in cps.sorted(cmp):
str.add $cp
for sl in n.ranges:
str.add($sl.a & '-' & $sl.b)
Expand Down
16 changes: 16 additions & 0 deletions tests/tests2.nim
Original file line number Diff line number Diff line change
Expand Up @@ -786,6 +786,8 @@ test "tset":
check "b".isMatch(re2"[abc]")
check "c".isMatch(re2"[abc]")
check(not "d".isMatch(re2"[abc]"))
check not "[".isMatch(re2"[abc]")
check not "]".isMatch(re2"[abc]")
check "a".isMatch(re2"[\w]")
check "1".isMatch(re2"[\w]")
check "1".isMatch(re2"[\d]")
Expand Down Expand Up @@ -915,6 +917,20 @@ test "tset":
check "a".isMatch(re2"[\x{61}]")
check "abab".isMatch(re2"[\x61-\x62]*")
check "a".isMatch(re2"[\141]")
check "a".isMatch(re2"[21a-z76]")
check "x".isMatch(re2"[21a-z76]")
check "z".isMatch(re2"[21a-z76]")
check "1".isMatch(re2"[21a-z76]")
check "2".isMatch(re2"[21a-z76]")
check "6".isMatch(re2"[21a-z76]")
check "7".isMatch(re2"[21a-z76]")
block:
const s = "qwertyuiopasdfghjklzxcvbnm"
const exp = re2("[" & s & "]")
var matched = 0
for c in s:
matched += int(isMatch($c, exp))
doAssert matched == s.len

test "tnot_set":
check "a".matchWithCapt(re2"([^b])") == @["a"]
Expand Down
Loading