nitely · nitely · Dec 11, 2024 · Dec 11, 2024 · Dec 11, 2024
diff --git a/.gitignore b/.gitignore
@@ -3,6 +3,7 @@ src/regex.js
 src/regex.out
 src/regex/litopt
 src/regex/nfatype
+src/regex/common
 tests/tests
 tests/tests.js
 tests/tests2

diff --git a/regex.nimble b/regex.nimble
@@ -20,6 +20,7 @@ task test2, "Test":
   execTest "c", "-r -o:bin/regex src/regex.nim"
   execTest "c", "-r -o:bin/litopt src/regex/litopt.nim"
   execTest "c", "-r -o:bin/nfatype src/regex/nfatype.nim"
+  execTest "c", "-r -o:bin/common src/regex/common.nim"
   execTest "c", "-r tests/tests2.nim"
   execTest "c", "-r -d:forceRegexAtRuntime tests/tests2.nim"
   execTest "c", "-r -d:forceRegexAtRuntime -d:noRegexOpt tests/tests2.nim"

diff --git a/src/regex/common.nim b/src/regex/common.nim
@@ -1,5 +1,6 @@
 import std/unicode
 import std/strutils
+import std/algorithm
 
 type
   RegexError* = object of ValueError
@@ -23,10 +24,10 @@ func toRune*(c: char): Rune =
   result = Rune(c.ord)
 
 func `<=`*(x, y: Rune): bool =
-  x.int <= y.int
+  int32(x) <= int32(y)  # order runes by their int32 value
 
 func cmp*(x, y: Rune): int =
-  x.int - y.int
+  system.cmp(x.int32, y.int32)  # sign only; subtraction could overflow int32
 
 func bwRuneAt*(s: string, n: int): Rune =
   ## Take rune ending at ``n``
@@ -106,3 +107,79 @@ func verifyUtf8*(s: string): int =
     inc i
   if state == vusStart:
     result = -1
+
+type
+  SortedSeq*[T] = object
+    ## Sequence wrapper kept sorted: ``add`` re-sorts the backing seq.
+    s: seq[T]
+
+func initSortedSeq*[T]: SortedSeq[T] {.inline.} =
+  ## Return an empty ``SortedSeq``.
+  result = SortedSeq[T](s: newSeq[T]())
+
+func len*[T](s: SortedSeq[T]): int {.inline.} =
+  ## Number of stored elements.
+  len(s.s)
+
+func add*[T](s: var SortedSeq[T], x: openArray[T]) =
+  ## Append every element of ``x``, then re-sort the
+  ## backing seq with ``cmp``. An empty ``x`` is a no-op.
+  if x.len > 0:
+    s.s.add x
+    sort(s.s, cmp)
+
+func contains*[T](s: SortedSeq[T], x: T): bool =
+  ## Whether ``x`` is stored in ``s``. Up to 10 elements
+  ## are scanned linearly; longer seqs are binary-searched
+  ## with ``cmp``.
+  if s.len > 10:
+    result = binarySearch(s.s, x, cmp) >= 0
+  else:
+    result = x in s.s
+
+iterator items*[T](s: SortedSeq[T]): T {.inline.} =
+  ## Yield the stored elements in backing-seq (sorted) order.
+  for i in 0 ..< s.s.len:
+    yield s.s[i]
+
+
+when isMainModule:  # self-test; regex.nimble's test task runs this module
+  block:  # add keeps the backing seq sorted across calls
+    var s = initSortedSeq[int]()
+    doAssert s.s.len == 0
+    s.add @[2,1,3]
+    doAssert s.s == @[1,2,3]
+    s.add @[5,4,6,7]
+    doAssert s.s == @[1,2,3,4,5,6,7]
+  block:  # len tracks the number of added elements
+    var s = initSortedSeq[int]()
+    doAssert s.len == 0
+    s.add @[2,1,3]
+    doAssert s.len == 3
+  block:  # membership via in/notin, before and after add
+    var s = initSortedSeq[int]()
+    doAssert 1 notin s
+    s.add @[2,1,3]
+    doAssert 1 in s
+    doAssert 2 in s
+    doAssert 3 in s
+    doAssert 4 notin s
+    doAssert 0 notin s
+  block:  # items yields the elements in sorted order
+    var s = initSortedSeq[int]()
+    s.add @[2,1,3]
+    var ss = newSeq[int]()
+    for x in s:
+      ss.add x
+    doAssert ss == @[1,2,3]
+  block:  # 202 values (> 10): exercises the binarySearch path of contains
+    var nums = newSeq[int]()
+    for x in 100 .. 200:
+      nums.add x
+    for x in 0 .. 100:
+      nums.add x
+    var s = initSortedSeq[int]()
+    s.add nums
+    for x in 0 .. 200:
+      doAssert x in s
+  echo "ok"
diff --git a/src/regex/exptransformation.nim b/src/regex/exptransformation.nim
@@ -183,12 +183,12 @@ func applyFlag(n: var Node, f: Flag) =
     # todo: apply recursevely to
     #       shorthands of reInSet/reNotSet (i.e: [:ascii:])
     if n.kind in {reInSet, reNotSet}:
-      var cps = initHashSet[Rune](2)
-      cps.incl(n.cps)
-      for cp in cps:
-        let cpsc = cp.swapCase()
-        if cp != cpsc:
-          n.cps.incl(cpsc)
+      var cps = newSeq[Rune]()
+      for cp in items n.cps:
+        let cp2 = cp.swapCase()
+        if cp != cp2:
+          cps.add cp2
+      n.cps.add cps
       for sl in n.ranges[0 .. ^1]:
         let
           cpa = sl.a.swapCase()

diff --git a/src/regex/nodematch.nim b/src/regex/nodematch.nim
@@ -1,5 +1,4 @@
 import std/unicode except `==`
-import std/sets
 
 import pkg/unicodedb/properties
 import pkg/unicodedb/types as utypes

diff --git a/src/regex/parser.nim b/src/regex/parser.nim
@@ -2,6 +2,7 @@ import std/unicode
 import std/strutils
 import std/sets
 import std/parseutils
+import std/sequtils
 
 import pkg/unicodedb/properties
 
@@ -291,69 +292,79 @@ func parseAsciiSet(sc: Scanner[Rune]): Node =
       break
     name.add(r.toUTF8)
   prettyCheck(
-    sc.peek == ']'.toRune,
-    "Invalid ascii set. Expected [:name:]")
+    sc.peek == ']'.toRune, "Invalid ascii set. Expected [:name:]"
+  )
   discard sc.next
   case name
   of "alpha":
     result.ranges.add([
       'a'.toRune .. 'z'.toRune,
-      'A'.toRune .. 'Z'.toRune])
+      'A'.toRune .. 'Z'.toRune
+    ])
   of "alnum":
     result.ranges.add([
       '0'.toRune .. '9'.toRune,
       'a'.toRune .. 'z'.toRune,
-      'A'.toRune .. 'Z'.toRune])
+      'A'.toRune .. 'Z'.toRune
+    ])
   of "ascii":
     result.ranges.add(
-      '\x00'.toRune .. '\x7F'.toRune)
+      '\x00'.toRune .. '\x7F'.toRune
+    )
   of "blank":
-    result.cps.incl(toHashSet([
-      '\t'.toRune, ' '.toRune]))
+    result.cps.add(['\t'.toRune, ' '.toRune])
   of "cntrl":
     result.ranges.add(
-      '\x00'.toRune .. '\x1F'.toRune)
-    result.cps.incl('\x7F'.toRune)
+      '\x00'.toRune .. '\x1F'.toRune
+    )
+    result.cps.add(['\x7F'.toRune])
   of "digit":
     result.ranges.add(
-      '0'.toRune .. '9'.toRune)
+      '0'.toRune .. '9'.toRune
+    )
   of "graph":
     result.ranges.add(
-      '!'.toRune .. '~'.toRune)
+      '!'.toRune .. '~'.toRune
+    )
   of "lower":
     result.ranges.add(
-      'a'.toRune .. 'z'.toRune)
+      'a'.toRune .. 'z'.toRune
+    )
   of "print":
     result.ranges.add(
-      ' '.toRune .. '~'.toRune)
+      ' '.toRune .. '~'.toRune
+    )
   of "punct":
     result.ranges.add([
       '!'.toRune .. '/'.toRune,
       ':'.toRune .. '@'.toRune,
       '['.toRune .. '`'.toRune,
-      '{'.toRune .. '~'.toRune])
+      '{'.toRune .. '~'.toRune
+    ])
   of "space":
-    result.cps.incl(toHashSet([
+    result.cps.add([
       '\t'.toRune, '\L'.toRune, '\v'.toRune,
-      '\f'.toRune, '\r'.toRune, ' '.toRune]))
+      '\f'.toRune, '\r'.toRune, ' '.toRune
+    ])
   of "upper":
-    result.ranges.add(
-      'A'.toRune .. 'Z'.toRune)
+    result.ranges.add('A'.toRune .. 'Z'.toRune)
   of "word":
     result.ranges.add([
       '0'.toRune .. '9'.toRune,
       'a'.toRune .. 'z'.toRune,
-      'A'.toRune .. 'Z'.toRune])
-    result.cps.incl('_'.toRune)
+      'A'.toRune .. 'Z'.toRune
+    ])
+    result.cps.add(['_'.toRune])
   of "xdigit":
     result.ranges.add([
       '0'.toRune .. '9'.toRune,
       'a'.toRune .. 'f'.toRune,
-      'A'.toRune .. 'F'.toRune])
+      'A'.toRune .. 'F'.toRune
+    ])
   else:
     prettyCheck(
-      false,
-      "Invalid ascii set. `$#` is not a valid name" %% name)
+      false, "Invalid ascii set. `$#` is not a valid name" %% name
+    )
 
 func parseSet(sc: Scanner[Rune]): Node =
   ## parse a set atom (i.e ``[a-z]``) into a
@@ -430,11 +441,10 @@ func parseSet(sc: Scanner[Rune]): Node =
         cps.add(cp)
     else:
       cps.add(cp)
-  # todo: use ref and set to nil when empty
-  result.cps.incl(cps.toHashSet)
+  result.cps.add toSeq(cps.toHashSet)
   prettyCheck(
-    hasEnd,
-    "Invalid set. Missing `]`")
+    hasEnd, "Invalid set. Missing `]`"
+  )
 
 func noRepeatCheck(sc: Scanner[Rune]) =
   ## Check next symbol is not a repetition

diff --git a/src/regex/types.nim b/src/regex/types.nim
@@ -2,8 +2,6 @@
 {.used.}
 
 import std/unicode
-import std/sets
-from std/algorithm import sorted
 from std/sequtils import toSeq
 
 import pkg/unicodedb/properties
@@ -112,7 +110,7 @@ type
     # reRepRange
     min*, max*: int16
     # reInSet, reNotSet
-    cps*: HashSet[Rune]
+    cps*: SortedSeq[Rune]
     ranges*: seq[Slice[Rune]]  # todo: interval tree
     shorthands*: seq[Node]
     # reUCC, reNotUCC
@@ -148,9 +146,10 @@ template initSetNodeImpl(result: var Node, k: NodeKind) =
   result = Node(
     kind: k,
     cp: '#'.toRune,
-    cps: initHashSet[Rune](2),
+    cps: initSortedSeq[Rune](),
     ranges: @[],
-    shorthands: @[])
+    shorthands: @[]
+  )
 
 func initSetNode*(): Node =
   ## return a set ``Node``,
@@ -193,7 +192,8 @@ func isEmpty*(n: Node): bool =
   result = (
     n.cps.len == 0 and
     n.ranges.len == 0 and
-    n.shorthands.len == 0)
+    n.shorthands.len == 0
+  )
 
 const
   opKind* = {
@@ -317,13 +317,7 @@ func `$`*(n: Node): string =
     str.add '['
     if n.kind == reNotSet:
       str.add '^'
-    var
-      cps = newSeq[Rune](n.cps.len)
-      i = 0
     for cp in n.cps:
-      cps[i] = cp
-      inc i
-    for cp in cps.sorted(cmp):
       str.add $cp
     for sl in n.ranges:
       str.add($sl.a & '-' & $sl.b)

diff --git a/tests/tests2.nim b/tests/tests2.nim
@@ -786,6 +786,8 @@ test "tset":
   check "b".isMatch(re2"[abc]")
   check "c".isMatch(re2"[abc]")
   check(not "d".isMatch(re2"[abc]"))
+  check not "[".isMatch(re2"[abc]")
+  check not "]".isMatch(re2"[abc]")
   check "a".isMatch(re2"[\w]")
   check "1".isMatch(re2"[\w]")
   check "1".isMatch(re2"[\d]")
@@ -915,6 +917,20 @@ test "tset":
   check "a".isMatch(re2"[\x{61}]")
   check "abab".isMatch(re2"[\x61-\x62]*")
   check "a".isMatch(re2"[\141]")
+  check "a".isMatch(re2"[21a-z76]")
+  check "x".isMatch(re2"[21a-z76]")
+  check "z".isMatch(re2"[21a-z76]")
+  check "1".isMatch(re2"[21a-z76]")
+  check "2".isMatch(re2"[21a-z76]")
+  check "6".isMatch(re2"[21a-z76]")
+  check "7".isMatch(re2"[21a-z76]")
+  block:  # every letter of a 26-literal set must match that set
+    const letters = "qwertyuiopasdfghjklzxcvbnm"
+    const setExp = re2("[" & letters & "]")
+    var hits = 0
+    for ch in letters:
+      if isMatch($ch, setExp): inc hits
+    check hits == letters.len
 
 test "tnot_set":
   check "a".matchWithCapt(re2"([^b])") == @["a"]