Skip to content

Commit

Permalink
match set node rework (#147)
Browse files Browse the repository at this point in the history
  • Loading branch information
nitely authored Dec 8, 2024
1 parent 98a6e5a commit e6bae73
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 54 deletions.
104 changes: 58 additions & 46 deletions src/regex/nodematch.nim
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,45 @@ func swapCase*(r: Rune): Rune =
return
result = r.toUpper()

func matchAsciiSet(n: Node, r: Rune): bool =
  ## Match `r` against a set node that has no nested shorthands
  ## (asserted below).
  ##
  ## The rune counts as "found" when it is in `n.cps` or in `n.ranges`.
  ## A found rune matches only an `reInSet` node; a rune that was not
  ## found matches only an `reNotSet` node.
  assert n.shorthands.len == 0
  let found = r in n.cps or r in n.ranges
  if found:
    n.kind == reInSet
  else:
    n.kind == reNotSet

func matchShorthand(n: Node, r: Rune): bool =
  ## Test `r` against a single shorthand node.
  ##
  ## A shorthand and its negated form share one branch: the predicate
  ## result is flipped with `xor` when `n.kind` is the negated kind.
  ## `reInSet`/`reNotSet` nodes are delegated to `matchAsciiSet`.
  ## Any other kind trips `doAssert`; the trailing `false` only gives
  ## that branch a value.
  let k = n.kind
  case k
  of reWord, reNotAlphaNum:
    r.isWord() xor (k == reNotAlphaNum)
  of reDigit, reNotDigit:
    r.isDecimal() xor (k == reNotDigit)
  of reWhiteSpace, reNotWhiteSpace:
    r.isWhiteSpace() xor (k == reNotWhiteSpace)
  of reUCC, reNotUCC:
    (r.unicodeCategory() in n.cc) xor (k == reNotUCC)
  of reWordAscii, reNotAlphaNumAscii:
    r.isWordAscii() xor (k == reNotAlphaNumAscii)
  of reDigitAscii, reNotDigitAscii:
    r.isDigitAscii() xor (k == reNotDigitAscii)
  of reWhiteSpaceAscii, reNotWhiteSpaceAscii:
    r.isWhiteSpaceAscii() xor (k == reNotWhiteSpaceAscii)
  of reInSet, reNotSet:
    matchAsciiSet(n, r)
  else:
    doAssert false
    false

func matchSet(n: Node, r: Rune): bool =
  ## Match `r` against a set node, including its shorthands.
  ##
  ## The rune is "found" when it is in `n.cps`, in `n.ranges`, or is
  ## accepted by any node in `n.shorthands` (checked in order, stopping
  ## at the first hit). A found rune matches only an `reInSet` node; a
  ## rune that was not found matches only an `reNotSet` node.
  var found = r in n.cps or r in n.ranges
  if not found:
    # Shorthands are only consulted when the direct lookups miss.
    for sh in n.shorthands:
      if matchShorthand(sh, r):
        found = true
        break
  if found:
    n.kind == reInSet
  else:
    n.kind == reNotSet

func match*(n: Node, r: Rune): bool {.inline.} =
## match for ``Node`` of matchable kind.
## Return whether the node matches
Expand All @@ -115,52 +154,25 @@ func match*(n: Node, r: Rune): bool {.inline.} =
if n.kind == reChar:
return n.cp == r
case n.kind
of reEOE:
r == invalidRune
of reWord:
r.isWord()
of reNotAlphaNum:
not r.isWord()
of reDigit:
r.isDecimal()
of reNotDigit:
not r.isDecimal()
of reWhiteSpace:
r.isWhiteSpace()
of reNotWhiteSpace:
not r.isWhiteSpace()
of reInSet, reNotSet:
var matches = (
r in n.cps or
r in n.ranges)
if not matches:
for nn in n.shorthands:
matches = nn.match(r)
if matches: break
((matches and n.kind == reInSet) or
(not matches and n.kind == reNotSet))
of reAny:
r != lineBreakRune
of reAnyNL:
true
of reCharCI:
r == n.cp or r == n.cp.swapCase()
of reWordAscii:
r.isWordAscii()
of reDigitAscii:
r.isDigitAscii()
of reWhiteSpaceAscii:
r.isWhiteSpaceAscii()
of reUCC:
r.unicodeCategory() in n.cc
of reNotAlphaNumAscii:
not r.isWordAscii()
of reNotDigitAscii:
not r.isDigitAscii()
of reNotWhiteSpaceAscii:
not r.isWhiteSpaceAscii()
of reNotUCC:
r.unicodeCategory() notin n.cc
of reEOE: r == invalidRune
of reWord: r.isWord()
of reNotAlphaNum: not r.isWord()
of reDigit: r.isDecimal()
of reNotDigit: not r.isDecimal()
of reWhiteSpace: r.isWhiteSpace()
of reNotWhiteSpace: not r.isWhiteSpace()
of reAny: r != lineBreakRune
of reAnyNL: true
of reCharCI: r == n.cp or r == n.cp.swapCase()
of reUCC: r.unicodeCategory() in n.cc
of reNotUCC: r.unicodeCategory() notin n.cc
of reWordAscii: r.isWordAscii()
of reNotAlphaNumAscii: not r.isWordAscii()
of reDigitAscii: r.isDigitAscii()
of reNotDigitAscii: not r.isDigitAscii()
of reWhiteSpaceAscii: r.isWhiteSpaceAscii()
of reNotWhiteSpaceAscii: not r.isWhiteSpaceAscii()
of reInSet, reNotSet: matchSet(n, r)
else:
assert n.kind == reChar
n.cp == r
3 changes: 1 addition & 2 deletions src/regex/parser.nim
Original file line number Diff line number Diff line change
Expand Up @@ -275,8 +275,7 @@ func parseSetEscapedSeq(sc: Scanner[Rune]): Node =

func parseAsciiSet(sc: Scanner[Rune]): Node =
## Parse an ascii set (i.e: ``[:ascii:]``).
## The ascii set will get expanded
## and merged with the outer set
## An expanded ascii set is returned.
let startPos = sc.pos
assert sc.peek == ":".toRune
discard sc.next()
Expand Down
2 changes: 2 additions & 0 deletions tests/tests.nim
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ proc raises(pattern: string): bool =
result = true

proc raisesMsg(pattern: string): string =
result = ""
try:
discard pattern.re()
except RegexError:
Expand Down Expand Up @@ -71,6 +72,7 @@ func findAllCapt(s: string, reg: Regex): seq[seq[seq[Slice[int]]]] =
result = map(
findAll(s, reg),
func (m: RegexMatch): seq[seq[Slice[int]]] =
result = newSeq[seq[Slice[int]]]()
for i in 0 .. m.groupsCount-1:
result.add m.group(i))

Expand Down
15 changes: 9 additions & 6 deletions tests/tests2.nim
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ proc raises(pattern: string): bool =
result = true

proc raisesMsg(pattern: string): string =
result = ""
try:
discard pattern.re2()
except RegexError:
Expand All @@ -52,7 +53,7 @@ proc raisesMsg(pattern: string): string =
proc matchWithCapt(s: string, pattern: static Regex2): seq[string] =
  ## Match `s` against `pattern` (the match itself is verified with
  ## `check`) and return the text of every entry in `m.captures`,
  ## in order.
  var m = RegexMatch2()
  check match(s, pattern, m)
  # Allocate the result once; a preceding `setLen` would be overwritten
  # by this assignment anyway.
  result = newSeq[string](m.captures.len)
  for i, bounds in m.captures.pairs:
    result[i] = s[bounds]

Expand All @@ -62,7 +63,7 @@ proc matchWithBounds(s: string, pattern: static Regex2): seq[Slice[int]] =
return m.captures

proc toStrCaptures(m: RegexMatch2, s: string): seq[string] =
  ## Return the slice of `s` for every bounds entry in `m.captures`,
  ## in order.
  # Allocate the result once; a preceding `setLen` would be overwritten
  # by this assignment anyway.
  result = newSeq[string](m.captures.len)
  for i, bounds in m.captures.pairs:
    result[i] = s[bounds]

Expand All @@ -75,6 +76,7 @@ func findAllCapt(s: string, reg: Regex2): seq[seq[Slice[int]]] =
result = map(
findAll(s, reg),
func (m: RegexMatch2): seq[Slice[int]] =
result = newSeq[Slice[int]]()
for i in 0 .. m.groupsCount-1:
result.add m.group(i))

Expand Down Expand Up @@ -104,17 +106,18 @@ template matchMacro(s, r: untyped): untyped =

# Runs the `match` macro on `s` with regex `r` inside an immediately
# invoked func and returns the captures as a seq[string]. `check m`
# verifies that the match body actually ran. The captures are appended
# exactly once onto an explicitly initialised result; assigning
# `matches` and then also adding it would store every capture twice.
template matchMacroCapt(s, r: untyped): untyped =
  (func (): seq[string] =
    result = newSeq[string]()
    var m = false
    let exp = s
    match exp, r:
      m = true
      result.add matches
    check m)()

test "tmatch_macro":
block hasOwnScope:
var m = false
var matches: seq[string]
var matches = newSeq[string]()
match "abc", rex"(\w+)":
check matches == @["abc"]
m = true
Expand Down Expand Up @@ -2242,7 +2245,7 @@ test "treuse_regex_match":

test "tisInitialized":
block:
var re: Regex2
var re = default(Regex2)
check(not re.isInitialized)
re = re2"foo"
check re.isInitialized
Expand Down Expand Up @@ -3092,7 +3095,7 @@ test "tverifyutf8":
raisesInvalidUtf8 endsWith("\xff", re2"abc")
raisesInvalidUtf8 replace("\xff", re2"abc", "abc")
raisesInvalidUtf8 replace("\xff", re2"abc",
(proc (m: RegexMatch2, s: string): string = discard))
(proc (m: RegexMatch2, s: string): string = return ""))
raisesInvalidUtf8 escapeRe("\xff")

# bug: raises invalid utf8 regex in Nim 1.0 + js target
Expand Down

0 comments on commit e6bae73

Please sign in to comment.