Skip to content

Commit

Permalink
Fix casefold (#150)
Browse files Browse the repository at this point in the history
  • Loading branch information
nitely authored Dec 30, 2024
1 parent d4e6b73 commit 9ccc25f
Show file tree
Hide file tree
Showing 6 changed files with 37 additions and 17 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ tests/test_bug
docs/ugh
bin/*
bench/bench
config.nims
2 changes: 1 addition & 1 deletion regex.nimble
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ srcDir = "src"
skipDirs = @["tests", "bench", "docs"]

requires "nim >= 1.6.0"
requires "unicodedb >= 0.7.2"
requires "unicodedb >= 0.13.1"

template execTest(lang, target: static string) =
doAssert lang in ["c", "js"]
Expand Down
13 changes: 7 additions & 6 deletions src/regex/exptransformation.nim
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,13 @@ import std/sets
import std/tables
import std/algorithm

import pkg/unicodedb/casing

import ./exptype
import ./types
import ./common
import ./scanner

# todo: can not use unicodeplus due to
# https://github.com/nim-lang/Nim/issues/7059
func swapCase(r: Rune): Rune =
# Note a character can be
# non-lower and non-upper
Expand Down Expand Up @@ -178,10 +178,12 @@ func applyFlag(n: var Node, f: Flag) =
else:
discard
of flagCaseInsensitive:
if n.kind == reChar and n.cp != n.cp.swapCase():
if n.kind == reChar and n.cp.hasCaseFolds:
n.kind = reCharCI
n.cp = n.cp.simpleCaseFold
# todo: apply recursively to
# shorthands of reInSet/reNotSet (i.e: [:ascii:])
# XXX add all casefolds that map to the cp instead of swapCase
if n.kind in {reInSet, reNotSet}:
var cps = newSeq[Rune]()
for cp in items n.cps:
Expand All @@ -190,9 +192,8 @@ func applyFlag(n: var Node, f: Flag) =
cps.add cp2
n.cps.add cps
for sl in n.ranges[0 .. ^1]:
let
cpa = sl.a.swapCase()
cpb = sl.b.swapCase()
let cpa = sl.a.swapCase()
let cpb = sl.b.swapCase()
if sl.a != cpa and sl.b != cpb:
n.ranges.add(cpa .. cpb)
of flagUnGreedy:
Expand Down
4 changes: 3 additions & 1 deletion src/regex/nfamacro.nim
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import std/tables
import std/sets
import std/algorithm

import pkg/unicodedb/casing
import pkg/unicodedb/properties
import pkg/unicodedb/types as utypes

Expand Down Expand Up @@ -124,7 +125,8 @@ func genMatch(c: NimNode, n: Node): NimNode =
quote do: true
of reCharCI:
let cp2Lit = newLit n.cp.swapCase().int32
quote do: `c` == `cpLit` or `c` == `cp2Lit`
let cp3Lit = newLit n.cp.simpleCaseFold().int32
quote do: `c` == `cpLit` or `c` == `cp2Lit` or simpleCaseFold(`c`) == Rune(`cp3Lit`)
of reWordAscii:
genWordAsciiMatch(c)
of reNotAlphaNumAscii:
Expand Down
11 changes: 2 additions & 9 deletions src/regex/nodematch.nim
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import std/unicode except `==`

import pkg/unicodedb/casing
import pkg/unicodedb/properties
import pkg/unicodedb/types as utypes

Expand Down Expand Up @@ -97,14 +98,6 @@ func isDigitAscii(r: Rune): bool {.inline.} =
else:
false

# todo: can not use unicodeplus due to
# https://github.com/nim-lang/Nim/issues/7059
func swapCase*(r: Rune): Rune =
  ## Flip the case of `r`.
  ##
  ## The lowercase form is returned when lowering actually changes the
  ## rune; otherwise the result of `toUpper` is returned.
  let lowered = r.toLower()
  if lowered != r:
    lowered
  else:
    r.toUpper()

func matchAsciiSet(n: Node, r: Rune): bool =
assert n.shorthands.len == 0
result = r in n.cps or
Expand Down Expand Up @@ -162,7 +155,7 @@ func match*(n: Node, r: Rune): bool {.inline.} =
of reNotWhiteSpace: not r.isWhiteSpace()
of reAny: r != lineBreakRune
of reAnyNL: true
of reCharCI: r == n.cp or r == n.cp.swapCase()
of reCharCI: r == n.cp or n.cp == r.simpleCaseFold
of reUCC: r.unicodeCategory() in n.cc
of reNotUCC: r.unicodeCategory() notin n.cc
of reWordAscii: r.isWordAscii()
Expand Down
23 changes: 23 additions & 0 deletions tests/tests_misc.nim
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ func findAllCapt(s: string, reg: Regex2): seq[seq[Slice[int]]] =
result = map(
findAll(s, reg),
func (m: RegexMatch2): seq[Slice[int]] =
result = newSeq[Slice[int]]()
for i in 0 .. m.groupsCount-1:
result.add m.group(i))

Expand Down Expand Up @@ -696,3 +697,25 @@ test "rust_regression":
check findAllBounds(r"hiya \N{snowman} bye", re2"(\\N\{[^}]+})|([{}])") == @[5 .. 15]
check findAllCapt(r"hiya \N{snowman} bye", re2"(\\N\{[^}]+})|([{}])") ==
@[@[5 .. 15, nonCapture]]

# https://github.com/BurntSushi/rebar/pull/20
test "rebar":
# Case-insensitive matching between "s"/"S" and "ſ" (a character that is
# two bytes long in UTF-8, as asserted below), plus plain ASCII checks.
block:
# A caseless pattern must match regardless of which side holds "ſ"
# and which side holds "s" or "S".
check match("ſ", re2(r"s", {regexCaseless}))
check match("s", re2(r"ſ", {regexCaseless}))
check match("ſ", re2(r"S", {regexCaseless}))
check match("S", re2(r"ſ", {regexCaseless}))
# "ſ" takes two bytes, so a match on it spans byte offsets 0 .. 1,
# while a match on the one-byte "s"/"S" spans 0 .. 0.
check "ſ".len == 2
check findAllBounds("ſ", re2(r"s", {regexCaseless})) == @[0 .. 1]
check findAllBounds("s", re2(r"ſ", {regexCaseless})) == @[0 .. 0]
check findAllBounds("ſ", re2(r"S", {regexCaseless})) == @[0 .. 1]
check findAllBounds("S", re2(r"ſ", {regexCaseless})) == @[0 .. 0]
# XXX fix
# The same pairs written as single-element sets; these checks are
# disabled until the case flagged above is fixed.
#check match("s", re2(r"[ſ]", {regexCaseless}))
#check match("ſ", re2(r"[s]", {regexCaseless}))
# ASCII letters in both directions, and "@" matching itself under
# the caseless flag.
check match("a", re2(r"A", {regexCaseless}))
check match("A", re2(r"a", {regexCaseless}))
check match("@", re2(r"@", {regexCaseless}))
check findAllBounds("a", re2(r"A", {regexCaseless})) == @[0 .. 0]
check findAllBounds("A", re2(r"a", {regexCaseless})) == @[0 .. 0]
check findAllBounds("@", re2(r"@", {regexCaseless})) == @[0 .. 0]

0 comments on commit 9ccc25f

Please sign in to comment.