From 9ccc25f24a8533d783c95b2340d5bafadc008fd6 Mon Sep 17 00:00:00 2001 From: Esteban C Borsani Date: Mon, 30 Dec 2024 20:17:05 -0300 Subject: [PATCH] Fix casefold (#150) --- .gitignore | 1 + regex.nimble | 2 +- src/regex/exptransformation.nim | 13 +++++++------ src/regex/nfamacro.nim | 4 +++- src/regex/nodematch.nim | 11 ++--------- tests/tests_misc.nim | 23 +++++++++++++++++++++++ 6 files changed, 37 insertions(+), 17 deletions(-) diff --git a/.gitignore b/.gitignore index 5abb110f..a66bc216 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,4 @@ tests/test_bug docs/ugh bin/* bench/bench +config.nims diff --git a/regex.nimble b/regex.nimble index e21dffe4..efbf5467 100644 --- a/regex.nimble +++ b/regex.nimble @@ -8,7 +8,7 @@ srcDir = "src" skipDirs = @["tests", "bench", "docs"] requires "nim >= 1.6.0" -requires "unicodedb >= 0.7.2" +requires "unicodedb >= 0.13.1" template execTest(lang, target: static string) = doAssert lang in ["c", "js"] diff --git a/src/regex/exptransformation.nim b/src/regex/exptransformation.nim index 25f5cf95..060965cc 100644 --- a/src/regex/exptransformation.nim +++ b/src/regex/exptransformation.nim @@ -3,13 +3,13 @@ import std/sets import std/tables import std/algorithm +import pkg/unicodedb/casing + import ./exptype import ./types import ./common import ./scanner -# todo: can not use unicodeplus due to -# https://github.com/nim-lang/Nim/issues/7059 func swapCase(r: Rune): Rune = # Note a character can be # non-lower and non-upper @@ -178,10 +178,12 @@ func applyFlag(n: var Node, f: Flag) = else: discard of flagCaseInsensitive: - if n.kind == reChar and n.cp != n.cp.swapCase(): + if n.kind == reChar and n.cp.hasCaseFolds: n.kind = reCharCI + n.cp = n.cp.simpleCaseFold # todo: apply recursevely to # shorthands of reInSet/reNotSet (i.e: [:ascii:]) + # XXX add all casefolds that map to the cp instead of swapCase if n.kind in {reInSet, reNotSet}: var cps = newSeq[Rune]() for cp in items n.cps: @@ -190,9 +192,8 @@ func applyFlag(n: var Node, f: Flag) = cps.add cp2 n.cps.add cps for sl in n.ranges[0 .. ^1]: - let - cpa = sl.a.swapCase() - cpb = sl.b.swapCase() + let cpa = sl.a.swapCase() + let cpb = sl.b.swapCase() if sl.a != cpa and sl.b != cpb: n.ranges.add(cpa .. cpb) of flagUnGreedy: diff --git a/src/regex/nfamacro.nim b/src/regex/nfamacro.nim index f31fc839..5af2fb84 100644 --- a/src/regex/nfamacro.nim +++ b/src/regex/nfamacro.nim @@ -6,6 +6,7 @@ import std/tables import std/sets import std/algorithm +import pkg/unicodedb/casing import pkg/unicodedb/properties import pkg/unicodedb/types as utypes @@ -124,7 +125,8 @@ func genMatch(c: NimNode, n: Node): NimNode = quote do: true of reCharCI: let cp2Lit = newLit n.cp.swapCase().int32 - quote do: `c` == `cpLit` or `c` == `cp2Lit` + let cp3Lit = newLit n.cp.simpleCaseFold().int32 + quote do: `c` == `cpLit` or `c` == `cp2Lit` or simpleCaseFold(`c`) == Rune(`cp3Lit`) of reWordAscii: genWordAsciiMatch(c) of reNotAlphaNumAscii: diff --git a/src/regex/nodematch.nim b/src/regex/nodematch.nim index b5a42ec4..78228935 100644 --- a/src/regex/nodematch.nim +++ b/src/regex/nodematch.nim @@ -1,5 +1,6 @@ import std/unicode except `==` +import pkg/unicodedb/casing import pkg/unicodedb/properties import pkg/unicodedb/types as utypes @@ -97,14 +98,6 @@ func isDigitAscii(r: Rune): bool {.inline.} = else: false -# todo: can not use unicodeplus due to -# https://github.com/nim-lang/Nim/issues/7059 -func swapCase*(r: Rune): Rune = - result = r.toLower() - if result != r: - return - result = r.toUpper() - func matchAsciiSet(n: Node, r: Rune): bool = assert n.shorthands.len == 0 result = r in n.cps or @@ -162,7 +155,7 @@ func match*(n: Node, r: Rune): bool {.inline.} = of reNotWhiteSpace: not r.isWhiteSpace() of reAny: r != lineBreakRune of reAnyNL: true - of reCharCI: r == n.cp or r == n.cp.swapCase() + of reCharCI: r == n.cp or n.cp == r.simpleCaseFold of reUCC: r.unicodeCategory() in n.cc of reNotUCC: r.unicodeCategory() notin n.cc of reWordAscii: r.isWordAscii() diff --git a/tests/tests_misc.nim b/tests/tests_misc.nim index 8dd536d2..e0d8989e 100644 --- a/tests/tests_misc.nim +++ b/tests/tests_misc.nim @@ -70,6 +70,7 @@ func findAllCapt(s: string, reg: Regex2): seq[seq[Slice[int]]] = result = map( findAll(s, reg), func (m: RegexMatch2): seq[Slice[int]] = + result = newSeq[Slice[int]]() for i in 0 .. m.groupsCount-1: result.add m.group(i)) @@ -696,3 +697,25 @@ test "rust_regression": check findAllBounds(r"hiya \N{snowman} bye", re2"(\\N\{[^}]+})|([{}])") == @[5 .. 15] check findAllCapt(r"hiya \N{snowman} bye", re2"(\\N\{[^}]+})|([{}])") == @[@[5 .. 15, nonCapture]] + +# https://github.com/BurntSushi/rebar/pull/20 +test "rebar": + block: + check match("ſ", re2(r"s", {regexCaseless})) + check match("s", re2(r"ſ", {regexCaseless})) + check match("ſ", re2(r"S", {regexCaseless})) + check match("S", re2(r"ſ", {regexCaseless})) + check "ſ".len == 2 + check findAllBounds("ſ", re2(r"s", {regexCaseless})) == @[0 .. 1] + check findAllBounds("s", re2(r"ſ", {regexCaseless})) == @[0 .. 0] + check findAllBounds("ſ", re2(r"S", {regexCaseless})) == @[0 .. 1] + check findAllBounds("S", re2(r"ſ", {regexCaseless})) == @[0 .. 0] + # XXX fix + #check match("s", re2(r"[ſ]", {regexCaseless})) + #check match("ſ", re2(r"[s]", {regexCaseless})) + check match("a", re2(r"A", {regexCaseless})) + check match("A", re2(r"a", {regexCaseless})) + check match("@", re2(r"@", {regexCaseless})) + check findAllBounds("a", re2(r"A", {regexCaseless})) == @[0 .. 0] + check findAllBounds("A", re2(r"a", {regexCaseless})) == @[0 .. 0] + check findAllBounds("@", re2(r"@", {regexCaseless})) == @[0 .. 0]