From 9ccc25f24a8533d783c95b2340d5bafadc008fd6 Mon Sep 17 00:00:00 2001
From: Esteban C Borsani <ecastroborsani@gmail.com>
Date: Mon, 30 Dec 2024 20:17:05 -0300
Subject: [PATCH] Fix casefold (#150)

---
 .gitignore                      |  1 +
 regex.nimble                    |  2 +-
 src/regex/exptransformation.nim | 13 +++++++------
 src/regex/nfamacro.nim          |  4 +++-
 src/regex/nodematch.nim         | 11 ++---------
 tests/tests_misc.nim            | 23 +++++++++++++++++++++++
 6 files changed, 37 insertions(+), 17 deletions(-)

diff --git a/.gitignore b/.gitignore
index 5abb110f..a66bc216 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,3 +14,4 @@ tests/test_bug
 docs/ugh
 bin/*
 bench/bench
+config.nims
diff --git a/regex.nimble b/regex.nimble
index e21dffe4..efbf5467 100644
--- a/regex.nimble
+++ b/regex.nimble
@@ -8,7 +8,7 @@ srcDir = "src"
 skipDirs = @["tests", "bench", "docs"]
 
 requires "nim >= 1.6.0"
-requires "unicodedb >= 0.7.2"
+requires "unicodedb >= 0.13.1"
 
 template execTest(lang, target: static string) =
   doAssert lang in ["c", "js"]
diff --git a/src/regex/exptransformation.nim b/src/regex/exptransformation.nim
index 25f5cf95..060965cc 100644
--- a/src/regex/exptransformation.nim
+++ b/src/regex/exptransformation.nim
@@ -3,13 +3,13 @@ import std/sets
 import std/tables
 import std/algorithm
 
+import pkg/unicodedb/casing
+
 import ./exptype
 import ./types
 import ./common
 import ./scanner
 
-# todo: can not use unicodeplus due to
-# https://github.com/nim-lang/Nim/issues/7059
 func swapCase(r: Rune): Rune =
   # Note a character can be
   # non-lower and non-upper
@@ -178,10 +178,12 @@ func applyFlag(n: var Node, f: Flag) =
     else:
       discard
   of flagCaseInsensitive:
-    if n.kind == reChar and n.cp != n.cp.swapCase():
+    if n.kind == reChar and n.cp.hasCaseFolds:
       n.kind = reCharCI
+      n.cp = n.cp.simpleCaseFold
     # todo: apply recursevely to
     #       shorthands of reInSet/reNotSet (i.e: [:ascii:])
+    # XXX: add every code point sharing a case fold with cp, not just swapCase
     if n.kind in {reInSet, reNotSet}:
       var cps = newSeq[Rune]()
       for cp in items n.cps:
@@ -190,9 +192,8 @@ func applyFlag(n: var Node, f: Flag) =
           cps.add cp2
       n.cps.add cps
       for sl in n.ranges[0 .. ^1]:
-        let
-          cpa = sl.a.swapCase()
-          cpb = sl.b.swapCase()
+        let cpa = sl.a.swapCase()
+        let cpb = sl.b.swapCase()
         if sl.a != cpa and sl.b != cpb:
           n.ranges.add(cpa .. cpb)
   of flagUnGreedy:
diff --git a/src/regex/nfamacro.nim b/src/regex/nfamacro.nim
index f31fc839..5af2fb84 100644
--- a/src/regex/nfamacro.nim
+++ b/src/regex/nfamacro.nim
@@ -6,6 +6,7 @@ import std/tables
 import std/sets
 import std/algorithm
 
+import pkg/unicodedb/casing
 import pkg/unicodedb/properties
 import pkg/unicodedb/types as utypes
 
@@ -124,7 +125,8 @@ func genMatch(c: NimNode, n: Node): NimNode =
       quote do: true
     of reCharCI:
       let cp2Lit = newLit n.cp.swapCase().int32
-      quote do: `c` == `cpLit` or `c` == `cp2Lit`
+      let cp3Lit = newLit n.cp.simpleCaseFold().int32
+      quote do: `c` == `cpLit` or `c` == `cp2Lit` or simpleCaseFold(`c`) == Rune(`cp3Lit`)
     of reWordAscii:
       genWordAsciiMatch(c)
     of reNotAlphaNumAscii:
diff --git a/src/regex/nodematch.nim b/src/regex/nodematch.nim
index b5a42ec4..78228935 100644
--- a/src/regex/nodematch.nim
+++ b/src/regex/nodematch.nim
@@ -1,5 +1,6 @@
 import std/unicode except `==`
 
+import pkg/unicodedb/casing
 import pkg/unicodedb/properties
 import pkg/unicodedb/types as utypes
 
@@ -97,14 +98,6 @@ func isDigitAscii(r: Rune): bool {.inline.} =
   else:
     false
 
-# todo: can not use unicodeplus due to
-# https://github.com/nim-lang/Nim/issues/7059
-func swapCase*(r: Rune): Rune =
-  result = r.toLower()
-  if result != r:
-    return
-  result = r.toUpper()
-
 func matchAsciiSet(n: Node, r: Rune): bool =
   assert n.shorthands.len == 0
   result = r in n.cps or
@@ -162,7 +155,7 @@ func match*(n: Node, r: Rune): bool {.inline.} =
   of reNotWhiteSpace: not r.isWhiteSpace()
   of reAny: r != lineBreakRune
   of reAnyNL: true
-  of reCharCI: r == n.cp or r == n.cp.swapCase()
+  of reCharCI: r == n.cp or n.cp == r.simpleCaseFold
   of reUCC: r.unicodeCategory() in n.cc
   of reNotUCC: r.unicodeCategory() notin n.cc
   of reWordAscii: r.isWordAscii()
diff --git a/tests/tests_misc.nim b/tests/tests_misc.nim
index 8dd536d2..e0d8989e 100644
--- a/tests/tests_misc.nim
+++ b/tests/tests_misc.nim
@@ -70,6 +70,7 @@ func findAllCapt(s: string, reg: Regex2): seq[seq[Slice[int]]] =
   result = map(
     findAll(s, reg),
     func (m: RegexMatch2): seq[Slice[int]] =
+      result = newSeq[Slice[int]]()
       for i in 0 .. m.groupsCount-1:
         result.add m.group(i))
 
@@ -696,3 +697,25 @@ test "rust_regression":
   check findAllBounds(r"hiya \N{snowman} bye", re2"(\\N\{[^}]+})|([{}])") == @[5 .. 15]
   check findAllCapt(r"hiya \N{snowman} bye", re2"(\\N\{[^}]+})|([{}])") ==
     @[@[5 .. 15, nonCapture]]
+
+# https://github.com/BurntSushi/rebar/pull/20
+test "rebar":
+  block:
+    check match("ſ", re2(r"s", {regexCaseless}))
+    check match("s", re2(r"ſ", {regexCaseless}))
+    check match("ſ", re2(r"S", {regexCaseless}))
+    check match("S", re2(r"ſ", {regexCaseless}))
+    check "ſ".len == 2
+    check findAllBounds("ſ", re2(r"s", {regexCaseless})) == @[0 .. 1]
+    check findAllBounds("s", re2(r"ſ", {regexCaseless})) == @[0 .. 0]
+    check findAllBounds("ſ", re2(r"S", {regexCaseless})) == @[0 .. 1]
+    check findAllBounds("S", re2(r"ſ", {regexCaseless})) == @[0 .. 0]
+    # XXX fix: caseless matching inside sets ([...]) does not use case folds yet
+    #check match("s", re2(r"[ſ]", {regexCaseless}))
+    #check match("ſ", re2(r"[s]", {regexCaseless}))
+    check match("a", re2(r"A", {regexCaseless}))
+    check match("A", re2(r"a", {regexCaseless}))
+    check match("@", re2(r"@", {regexCaseless}))
+    check findAllBounds("a", re2(r"A", {regexCaseless})) == @[0 .. 0]
+    check findAllBounds("A", re2(r"a", {regexCaseless})) == @[0 .. 0]
+    check findAllBounds("@", re2(r"@", {regexCaseless})) == @[0 .. 0]