-
Notifications
You must be signed in to change notification settings - Fork 37
/
Copy pathword.go
79 lines (68 loc) · 1.57 KB
/
word.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
package main
import (
"unicode"
"unicode/utf8"
"golang.org/x/text/transform"
)
// wordRune normalizes c to a small alphabet of "word" characters:
//   - letters are returned lowercased,
//   - digits are returned as-is,
//   - every other rune becomes a single space.
//
// ASCII input is classified with plain range comparisons; the unicode
// tables are only consulted for runes >= 128.
func wordRune(c rune) rune {
	if c < 128 {
		// ASCII fast path (also covers negative rune values).
		switch {
		case 'a' <= c && c <= 'z', '0' <= c && c <= '9':
			return c
		case 'A' <= c && c <= 'Z':
			return c - 'A' + 'a'
		default:
			return ' '
		}
	}
	if unicode.IsDigit(c) {
		return c
	}
	if unicode.IsLetter(c) {
		return unicode.ToLower(c)
	}
	return ' '
}
// wordString maps every rune of s through wordRune and collapses each run
// of consecutive spaces in the result into one space. A single leading or
// trailing space is kept; only repeats are dropped.
func wordString(s string) string {
	out := make([]rune, 0, 20)
	afterSpace := false // true when the last mapped rune was a space
	for _, r := range s {
		w := wordRune(r)
		isSpace := w == ' '
		if isSpace && afterSpace {
			// Still inside a run of spaces: emit nothing.
			continue
		}
		out = append(out, w)
		afterSpace = isSpace
	}
	return string(out)
}
// A wordTransformer does the same transformation as wordString, but in a
// streaming fashion: input arrives in chunks through Transform, and the
// space-collapsing state is carried from one call to the next.
type wordTransformer struct {
// prevRune is the last rune written to the output; Reset sets it to 0.
// Transform compares it against ' ' to drop repeated spaces.
prevRune rune
}
// Transform reads runes from src, maps each through wordRune, and writes the
// UTF-8 encoding of the result to dst, skipping a space whenever the rune
// written before it was also a space.
//
// It returns the number of bytes written to dst and consumed from src.
// err is transform.ErrShortSrc when src ends in an incomplete UTF-8 sequence
// and atEOF is false, and transform.ErrShortDst when the next rune does not
// fit in the remaining space of dst. In both cases the counts cover only the
// runes fully processed so far. Invalid bytes decode to utf8.RuneError,
// which wordRune turns into a space.
func (t *wordTransformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	for nSrc < len(src) {
		rest := src[nSrc:]
		decoded, size := utf8.DecodeRune(rest)
		if decoded == utf8.RuneError && !atEOF && !utf8.FullRune(rest) {
			// The tail may be the start of a rune whose remaining bytes
			// have not arrived yet; ask the caller for more input.
			return nDst, nSrc, transform.ErrShortSrc
		}

		out := wordRune(decoded)
		if out != ' ' || t.prevRune != ' ' {
			if nDst+utf8.RuneLen(out) > len(dst) {
				// Leave the rune unconsumed so it is retried with a larger dst.
				return nDst, nSrc, transform.ErrShortDst
			}
			t.prevRune = out
			nDst += utf8.EncodeRune(dst[nDst:], out)
		}
		// Repeated spaces are consumed without producing any output.
		nSrc += size
	}
	return nDst, nSrc, nil
}
// Reset clears the remembered previous rune, so the next Transform call
// starts in the same state as a fresh wordString pass.
func (t *wordTransformer) Reset() {
	t.prevRune = '\x00'
}