From e6bae738b9624432a265f6ee1e343edf03cab363 Mon Sep 17 00:00:00 2001
From: Esteban C Borsani <ecastroborsani@gmail.com>
Date: Sun, 8 Dec 2024 17:09:32 -0300
Subject: [PATCH] match set node rework (#147)

---
 src/regex/nodematch.nim | 104 ++++++++++++++++++++++------------------
 src/regex/parser.nim    |   3 +-
 tests/tests.nim         |   2 +
 tests/tests2.nim        |  15 +++---
 4 files changed, 70 insertions(+), 54 deletions(-)

diff --git a/src/regex/nodematch.nim b/src/regex/nodematch.nim
index b03bf65..9225fcc 100644
--- a/src/regex/nodematch.nim
+++ b/src/regex/nodematch.nim
@@ -106,6 +106,45 @@ func swapCase*(r: Rune): Rune =
     return
   result = r.toUpper()
 
+func matchAsciiSet(n: Node, r: Rune): bool =  # set test for a node that has no shorthands
+  assert n.shorthands.len == 0  # only cps and ranges are consulted below
+  result = r in n.cps or
+    r in n.ranges  # true when r is one of the explicit code points or falls in a range
+  result = (result and n.kind == reInSet) or
+    (not result and n.kind == reNotSet)  # reNotSet inverts membership; any other kind yields false
+
+func matchShorthand(n: Node, r: Rune): bool =  # test r against one entry of a set's shorthands
+  case n.kind  # each branch is the whole result for that node kind
+  of reWord: r.isWord()
+  of reNotAlphaNum: not r.isWord()
+  of reDigit: r.isDecimal()
+  of reNotDigit: not r.isDecimal()
+  of reWhiteSpace: r.isWhiteSpace()
+  of reNotWhiteSpace: not r.isWhiteSpace()
+  of reUCC: r.unicodeCategory() in n.cc
+  of reNotUCC: r.unicodeCategory() notin n.cc
+  of reWordAscii: r.isWordAscii()
+  of reNotAlphaNumAscii: not r.isWordAscii()
+  of reDigitAscii: r.isDigitAscii()
+  of reNotDigitAscii: not r.isDigitAscii()
+  of reWhiteSpaceAscii: r.isWhiteSpaceAscii()
+  of reNotWhiteSpaceAscii: not r.isWhiteSpaceAscii()
+  of reInSet, reNotSet: matchAsciiSet(n, r)  # set nested in a set; asserted to carry no shorthands
+  else:
+    doAssert false  # any other node kind is not accepted here
+    false  # gives the branch a bool value; the doAssert above fails before this is used
+
+func matchSet(n: Node, r: Rune): bool =  # membership test used by match for reInSet / reNotSet
+  result = r in n.cps or
+    r in n.ranges  # explicit code points and ranges are checked first
+  if not result:  # otherwise fall back to the shorthand nodes
+    for nn in n.shorthands:
+      result = matchShorthand(nn, r)
+      if result:
+        break  # the first matching shorthand settles membership
+  result = (result and n.kind == reInSet) or
+    (not result and n.kind == reNotSet)  # reNotSet negates the membership result
+
 func match*(n: Node, r: Rune): bool {.inline.} =
   ## match for ``Node`` of matchable kind.
   ## Return whether the node matches
@@ -115,52 +154,25 @@ func match*(n: Node, r: Rune): bool {.inline.} =
   if n.kind == reChar:
     return n.cp == r
   case n.kind
-  of reEOE:
-    r == invalidRune
-  of reWord:
-    r.isWord()
-  of reNotAlphaNum:
-    not r.isWord()
-  of reDigit:
-    r.isDecimal()
-  of reNotDigit:
-    not r.isDecimal()
-  of reWhiteSpace:
-    r.isWhiteSpace()
-  of reNotWhiteSpace:
-    not r.isWhiteSpace()
-  of reInSet, reNotSet:
-    var matches = (
-      r in n.cps or
-      r in n.ranges)
-    if not matches:
-      for nn in n.shorthands:
-        matches = nn.match(r)
-        if matches: break
-    ((matches and n.kind == reInSet) or
-     (not matches and n.kind == reNotSet))
-  of reAny:
-    r != lineBreakRune
-  of reAnyNL:
-    true
-  of reCharCI:
-    r == n.cp or r == n.cp.swapCase()
-  of reWordAscii:
-    r.isWordAscii()
-  of reDigitAscii:
-    r.isDigitAscii()
-  of reWhiteSpaceAscii:
-    r.isWhiteSpaceAscii()
-  of reUCC:
-    r.unicodeCategory() in n.cc
-  of reNotAlphaNumAscii:
-    not r.isWordAscii()
-  of reNotDigitAscii:
-    not r.isDigitAscii()
-  of reNotWhiteSpaceAscii:
-    not r.isWhiteSpaceAscii()
-  of reNotUCC:
-    r.unicodeCategory() notin n.cc
+  of reEOE: r == invalidRune
+  of reWord: r.isWord()
+  of reNotAlphaNum: not r.isWord()
+  of reDigit: r.isDecimal()
+  of reNotDigit: not r.isDecimal()
+  of reWhiteSpace: r.isWhiteSpace()
+  of reNotWhiteSpace: not r.isWhiteSpace()
+  of reAny: r != lineBreakRune
+  of reAnyNL: true
+  of reCharCI: r == n.cp or r == n.cp.swapCase()
+  of reUCC: r.unicodeCategory() in n.cc
+  of reNotUCC: r.unicodeCategory() notin n.cc
+  of reWordAscii: r.isWordAscii()
+  of reNotAlphaNumAscii: not r.isWordAscii()
+  of reDigitAscii: r.isDigitAscii()
+  of reNotDigitAscii: not r.isDigitAscii()
+  of reWhiteSpaceAscii: r.isWhiteSpaceAscii()
+  of reNotWhiteSpaceAscii: not r.isWhiteSpaceAscii()
+  of reInSet, reNotSet: matchSet(n, r)
   else:
     assert n.kind == reChar
     n.cp == r
diff --git a/src/regex/parser.nim b/src/regex/parser.nim
index a8f1e34..6d8741a 100644
--- a/src/regex/parser.nim
+++ b/src/regex/parser.nim
@@ -275,8 +275,7 @@ func parseSetEscapedSeq(sc: Scanner[Rune]): Node =
 
 func parseAsciiSet(sc: Scanner[Rune]): Node =
   ## Parse an ascii set (i.e: ``[:ascii:]``).
-  ## The ascii set will get expanded
-  ## and merged with the outer set
+  ## An expanded ascii set is returned.
   let startPos = sc.pos
   assert sc.peek == ":".toRune
   discard sc.next()
diff --git a/tests/tests.nim b/tests/tests.nim
index 4b6fe26..d00aeb6 100644
--- a/tests/tests.nim
+++ b/tests/tests.nim
@@ -42,6 +42,7 @@ proc raises(pattern: string): bool =
     result = true
 
 proc raisesMsg(pattern: string): string =
+  result = ""
   try:
     discard pattern.re()
   except RegexError:
@@ -71,6 +72,7 @@ func findAllCapt(s: string, reg: Regex): seq[seq[seq[Slice[int]]]] =
   result = map(
     findAll(s, reg),
     func (m: RegexMatch): seq[seq[Slice[int]]] =
+      result = newSeq[seq[Slice[int]]]()
       for i in 0 .. m.groupsCount-1:
         result.add m.group(i))
 
diff --git a/tests/tests2.nim b/tests/tests2.nim
index 5c39d24..ef6a6f9 100644
--- a/tests/tests2.nim
+++ b/tests/tests2.nim
@@ -44,6 +44,7 @@ proc raises(pattern: string): bool =
     result = true
 
 proc raisesMsg(pattern: string): string =
+  result = ""
   try:
     discard pattern.re2()
   except RegexError:
@@ -52,7 +53,7 @@ proc raisesMsg(pattern: string): string =
 proc matchWithCapt(s: string, pattern: static Regex2): seq[string] =
   var m = RegexMatch2()
   check match(s, pattern, m)
-  result.setLen m.captures.len
+  result = newSeq[string](m.captures.len)
   for i, bounds in m.captures.pairs:
     result[i] = s[bounds]
 
@@ -62,7 +63,7 @@ proc matchWithBounds(s: string, pattern: static Regex2): seq[Slice[int]] =
   return m.captures
 
 proc toStrCaptures(m: RegexMatch2, s: string): seq[string] =
-  result.setLen m.captures.len
+  result = newSeq[string](m.captures.len)
   for i, bounds in m.captures.pairs:
     result[i] = s[bounds]
 
@@ -75,6 +76,7 @@ func findAllCapt(s: string, reg: Regex2): seq[seq[Slice[int]]] =
   result = map(
     findAll(s, reg),
     func (m: RegexMatch2): seq[Slice[int]] =
+      result = newSeq[Slice[int]]()
       for i in 0 .. m.groupsCount-1:
         result.add m.group(i))
 
@@ -104,17 +106,18 @@ template matchMacro(s, r: untyped): untyped =
 
 template matchMacroCapt(s, r: untyped): untyped =
   (func (): seq[string] =
+    result = newSeq[string]()
     var m = false
     let exp = s
     match exp, r:
       m = true
-      result = matches
+      result.add matches
     check m)()
 
 test "tmatch_macro":
   block hasOwnScope:
     var m = false
-    var matches: seq[string]
+    var matches = newSeq[string]()
     match "abc", rex"(\w+)":
       check matches == @["abc"]
       m = true
@@ -2242,7 +2245,7 @@ test "treuse_regex_match":
 
 test "tisInitialized":
   block:
-    var re: Regex2
+    var re = default(Regex2)
     check(not re.isInitialized)
     re = re2"foo"
     check re.isInitialized
@@ -3092,7 +3095,7 @@ test "tverifyutf8":
   raisesInvalidUtf8 endsWith("\xff", re2"abc")
   raisesInvalidUtf8 replace("\xff", re2"abc", "abc")
   raisesInvalidUtf8 replace("\xff", re2"abc",
-    (proc (m: RegexMatch2, s: string): string = discard))
+    (proc (m: RegexMatch2, s: string): string = return ""))
   raisesInvalidUtf8 escapeRe("\xff")
 
 # bug: raises invalid utf8 regex in Nim 1.0 + js target