From e6bae738b9624432a265f6ee1e343edf03cab363 Mon Sep 17 00:00:00 2001
From: Esteban C Borsani
Date: Sun, 8 Dec 2024 17:09:32 -0300
Subject: [PATCH] match set node rework (#147)

---
Notes (ignored by patch tools; not part of the commit message):

* nodematch.nim: set matching moves out of `match` into three new private
  helpers. `matchSet` checks `cps`/`ranges`, then tries each entry of
  `n.shorthands` through `matchShorthand` instead of recursing into `match`.
  `matchShorthand` sends nested `reInSet`/`reNotSet` nodes to
  `matchAsciiSet`, which asserts that such a node has no shorthands of its
  own. The remaining `match` branches are only reflowed onto single lines.
* parser.nim: doc comment of `parseAsciiSet` reworded.
* tests: `result` and local variables are now initialized explicitly.
* NOTE(review): this file was recovered from a copy whose line breaks had
  been collapsed. Line structure and hunk counts were rebuilt from the
  hunk headers and the diffstat, but the leading whitespace of context and
  removed lines is reconstructed using 2-space Nim indentation. Confirm
  against the upstream commit before applying; `git apply
  --ignore-whitespace` may be needed.

 src/regex/nodematch.nim | 104 ++++++++++++++++++++++------------------
 src/regex/parser.nim    |   3 +-
 tests/tests.nim         |   2 +
 tests/tests2.nim        |  15 +++---
 4 files changed, 70 insertions(+), 54 deletions(-)

diff --git a/src/regex/nodematch.nim b/src/regex/nodematch.nim
index b03bf65..9225fcc 100644
--- a/src/regex/nodematch.nim
+++ b/src/regex/nodematch.nim
@@ -106,6 +106,45 @@ func swapCase*(r: Rune): Rune =
     return
   result = r.toUpper()
 
+func matchAsciiSet(n: Node, r: Rune): bool =
+  assert n.shorthands.len == 0
+  result = r in n.cps or
+    r in n.ranges
+  result = (result and n.kind == reInSet) or
+    (not result and n.kind == reNotSet)
+
+func matchShorthand(n: Node, r: Rune): bool =
+  case n.kind
+  of reWord: r.isWord()
+  of reNotAlphaNum: not r.isWord()
+  of reDigit: r.isDecimal()
+  of reNotDigit: not r.isDecimal()
+  of reWhiteSpace: r.isWhiteSpace()
+  of reNotWhiteSpace: not r.isWhiteSpace()
+  of reUCC: r.unicodeCategory() in n.cc
+  of reNotUCC: r.unicodeCategory() notin n.cc
+  of reWordAscii: r.isWordAscii()
+  of reNotAlphaNumAscii: not r.isWordAscii()
+  of reDigitAscii: r.isDigitAscii()
+  of reNotDigitAscii: not r.isDigitAscii()
+  of reWhiteSpaceAscii: r.isWhiteSpaceAscii()
+  of reNotWhiteSpaceAscii: not r.isWhiteSpaceAscii()
+  of reInSet, reNotSet: matchAsciiSet(n, r)
+  else:
+    doAssert false
+    false
+
+func matchSet(n: Node, r: Rune): bool =
+  result = r in n.cps or
+    r in n.ranges
+  if not result:
+    for nn in n.shorthands:
+      result = matchShorthand(nn, r)
+      if result:
+        break
+  result = (result and n.kind == reInSet) or
+    (not result and n.kind == reNotSet)
+
 func match*(n: Node, r: Rune): bool {.inline.} =
   ## match for ``Node`` of matchable kind.
   ## Return whether the node matches
@@ -115,52 +154,25 @@ func match*(n: Node, r: Rune): bool {.inline.} =
   if n.kind == reChar:
     return n.cp == r
   case n.kind
-  of reEOE:
-    r == invalidRune
-  of reWord:
-    r.isWord()
-  of reNotAlphaNum:
-    not r.isWord()
-  of reDigit:
-    r.isDecimal()
-  of reNotDigit:
-    not r.isDecimal()
-  of reWhiteSpace:
-    r.isWhiteSpace()
-  of reNotWhiteSpace:
-    not r.isWhiteSpace()
-  of reInSet, reNotSet:
-    var matches = (
-      r in n.cps or
-      r in n.ranges)
-    if not matches:
-      for nn in n.shorthands:
-        matches = nn.match(r)
-        if matches: break
-    ((matches and n.kind == reInSet) or
-     (not matches and n.kind == reNotSet))
-  of reAny:
-    r != lineBreakRune
-  of reAnyNL:
-    true
-  of reCharCI:
-    r == n.cp or r == n.cp.swapCase()
-  of reWordAscii:
-    r.isWordAscii()
-  of reDigitAscii:
-    r.isDigitAscii()
-  of reWhiteSpaceAscii:
-    r.isWhiteSpaceAscii()
-  of reUCC:
-    r.unicodeCategory() in n.cc
-  of reNotAlphaNumAscii:
-    not r.isWordAscii()
-  of reNotDigitAscii:
-    not r.isDigitAscii()
-  of reNotWhiteSpaceAscii:
-    not r.isWhiteSpaceAscii()
-  of reNotUCC:
-    r.unicodeCategory() notin n.cc
+  of reEOE: r == invalidRune
+  of reWord: r.isWord()
+  of reNotAlphaNum: not r.isWord()
+  of reDigit: r.isDecimal()
+  of reNotDigit: not r.isDecimal()
+  of reWhiteSpace: r.isWhiteSpace()
+  of reNotWhiteSpace: not r.isWhiteSpace()
+  of reAny: r != lineBreakRune
+  of reAnyNL: true
+  of reCharCI: r == n.cp or r == n.cp.swapCase()
+  of reUCC: r.unicodeCategory() in n.cc
+  of reNotUCC: r.unicodeCategory() notin n.cc
+  of reWordAscii: r.isWordAscii()
+  of reNotAlphaNumAscii: not r.isWordAscii()
+  of reDigitAscii: r.isDigitAscii()
+  of reNotDigitAscii: not r.isDigitAscii()
+  of reWhiteSpaceAscii: r.isWhiteSpaceAscii()
+  of reNotWhiteSpaceAscii: not r.isWhiteSpaceAscii()
+  of reInSet, reNotSet: matchSet(n, r)
   else:
     assert n.kind == reChar
     n.cp == r
diff --git a/src/regex/parser.nim b/src/regex/parser.nim
index a8f1e34..6d8741a 100644
--- a/src/regex/parser.nim
+++ b/src/regex/parser.nim
@@ -275,8 +275,7 @@ func parseSetEscapedSeq(sc: Scanner[Rune]): Node =
 
 func parseAsciiSet(sc: Scanner[Rune]): Node =
   ## Parse an ascii set (i.e: ``[:ascii:]``).
-  ## The ascii set will get expanded
-  ## and merged with the outer set
+  ## An expanded ascii set is returned.
   let startPos = sc.pos
   assert sc.peek == ":".toRune
   discard sc.next()
diff --git a/tests/tests.nim b/tests/tests.nim
index 4b6fe26..d00aeb6 100644
--- a/tests/tests.nim
+++ b/tests/tests.nim
@@ -42,6 +42,7 @@ proc raises(pattern: string): bool =
     result = true
 
 proc raisesMsg(pattern: string): string =
+  result = ""
   try:
     discard pattern.re()
   except RegexError:
@@ -71,6 +72,7 @@ func findAllCapt(s: string, reg: Regex): seq[seq[seq[Slice[int]]]] =
   result = map(
     findAll(s, reg),
     func (m: RegexMatch): seq[seq[Slice[int]]] =
+      result = newSeq[seq[Slice[int]]]()
       for i in 0 .. m.groupsCount-1:
         result.add m.group(i))
 
diff --git a/tests/tests2.nim b/tests/tests2.nim
index 5c39d24..ef6a6f9 100644
--- a/tests/tests2.nim
+++ b/tests/tests2.nim
@@ -44,6 +44,7 @@ proc raises(pattern: string): bool =
     result = true
 
 proc raisesMsg(pattern: string): string =
+  result = ""
   try:
     discard pattern.re2()
   except RegexError:
@@ -52,7 +53,7 @@ proc raisesMsg(pattern: string): string =
 proc matchWithCapt(s: string, pattern: static Regex2): seq[string] =
   var m = RegexMatch2()
   check match(s, pattern, m)
-  result.setLen m.captures.len
+  result = newSeq[string](m.captures.len)
   for i, bounds in m.captures.pairs:
     result[i] = s[bounds]
 
@@ -62,7 +63,7 @@ proc matchWithBounds(s: string, pattern: static Regex2): seq[Slice[int]] =
   return m.captures
 
 proc toStrCaptures(m: RegexMatch2, s: string): seq[string] =
-  result.setLen m.captures.len
+  result = newSeq[string](m.captures.len)
   for i, bounds in m.captures.pairs:
     result[i] = s[bounds]
 
@@ -75,6 +76,7 @@ func findAllCapt(s: string, reg: Regex2): seq[seq[Slice[int]]] =
   result = map(
     findAll(s, reg),
     func (m: RegexMatch2): seq[Slice[int]] =
+      result = newSeq[Slice[int]]()
       for i in 0 .. m.groupsCount-1:
         result.add m.group(i))
 
@@ -104,17 +106,18 @@ template matchMacro(s, r: untyped): untyped =
 
 template matchMacroCapt(s, r: untyped): untyped =
   (func (): seq[string] =
+    result = newSeq[string]()
     var m = false
     let exp = s
     match exp, r:
       m = true
-      result = matches
+      result.add matches
     check m)()
 
 test "tmatch_macro":
   block hasOwnScope:
     var m = false
-    var matches: seq[string]
+    var matches = newSeq[string]()
     match "abc", rex"(\w+)":
       check matches == @["abc"]
       m = true
@@ -2242,7 +2245,7 @@ test "treuse_regex_match":
 
 test "tisInitialized":
   block:
-    var re: Regex2
+    var re = default(Regex2)
     check(not re.isInitialized)
     re = re2"foo"
     check re.isInitialized
@@ -3092,7 +3095,7 @@ test "tverifyutf8":
   raisesInvalidUtf8 endsWith("\xff", re2"abc")
   raisesInvalidUtf8 replace("\xff", re2"abc", "abc")
   raisesInvalidUtf8 replace("\xff", re2"abc",
-    (proc (m: RegexMatch2, s: string): string = discard))
+    (proc (m: RegexMatch2, s: string): string = return ""))
   raisesInvalidUtf8 escapeRe("\xff")
 
 # bug: raises invalid utf8 regex in Nim 1.0 + js target