Skip to content

Commit

Permalink
Merge pull request #10 from alex/codepoint-indexing
Browse files Browse the repository at this point in the history
* adds support for fixed indices expressed in terms of codepoints instead of bytes

* Additional tests

* Apply suggestions from code review

Co-Authored-By: Jonathan Rudenberg <[email protected]>

* Micro-optimization

* Fixed an off-by-one error

* Further micro-optimization

* Unused variable

* Rename rawLine to rawValue

* Added benchmarks of using codepoint indices

* Added an example to the readme
  • Loading branch information
ianlopshire authored Jun 28, 2019
2 parents 5ac6eb2 + 1232f6b commit 0326432
Show file tree
Hide file tree
Showing 4 changed files with 215 additions and 29 deletions.
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,5 +80,14 @@ for {
}
```

If the indices for your input are expressed in Unicode codepoints rather than
raw bytes, fixedwidth supports this. Your data must be UTF-8 encoded:

```go
decoder := fixedwidth.NewDecoder(strings.NewReader(data))
decoder.SetUseCodepointIndices(true)
// Decode as usual now
```

## Licence
MIT
20 changes: 20 additions & 0 deletions bench_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,26 @@ func BenchmarkUnmarshal_MixedData_100000(b *testing.B) {
}
}

// BenchmarkDecode_CodePoints_MixedData_1_Ascii times the decoding of a single
// ASCII-only record into mixedData with codepoint indexing switched on. A new
// Decoder is built on every iteration.
func BenchmarkDecode_CodePoints_MixedData_1_Ascii(b *testing.B) {
	var (
		record = []byte(` foo foo 42 42 42 42 42 42 42 42 4.2 4.2 4.2 4.2`)
		out    mixedData
	)
	for n := 0; n < b.N; n++ {
		dec := NewDecoder(bytes.NewReader(record))
		dec.SetUseCodepointIndices(true)
		// The decode error is deliberately discarded: only the cost of
		// decoding is being measured here.
		_ = dec.Decode(&out)
	}
}

// BenchmarkDecode_CodePoints_MixedData_1_UTF8 times the decoding of a single
// record containing multi-byte characters into mixedData with codepoint
// indexing switched on. A new Decoder is built on every iteration.
func BenchmarkDecode_CodePoints_MixedData_1_UTF8(b *testing.B) {
	var (
		record = []byte(` f☃☃ f☃☃ 42 42 42 42 42 42 42 42 4.2 4.2 4.2 4.2`)
		out    mixedData
	)
	for n := 0; n < b.N; n++ {
		dec := NewDecoder(bytes.NewReader(record))
		dec.SetUseCodepointIndices(true)
		// The decode error is deliberately discarded: only the cost of
		// decoding is being measured here.
		_ = dec.Decode(&out)
	}
}

func BenchmarkUnmarshal_String(b *testing.B) {
data := []byte(`foo `)
var v struct {
Expand Down
131 changes: 103 additions & 28 deletions decode.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"io"
"reflect"
"strconv"
"unicode/utf8"
)

// Unmarshal parses fixed width encoded data and stores the
Expand All @@ -19,8 +20,9 @@ func Unmarshal(data []byte, v interface{}) error {

// A Decoder reads and decodes fixed width data from an input stream.
type Decoder struct {
data *bufio.Reader
done bool
data *bufio.Reader
done bool
// useCodepointIndices selects whether struct-tag positions are counted in
// UTF-8 codepoints instead of bytes. It is set through
// SetUseCodepointIndices and handed to newRawValue for each line read.
useCodepointIndices bool
}

// NewDecoder returns a new decoder that reads from r.
Expand Down Expand Up @@ -70,6 +72,13 @@ func (e *UnmarshalTypeError) Error() string {
return s
}

// SetUseCodepointIndices selects the unit in which struct-tag positions are
// interpreted. Passing true makes the Decoder count UTF-8 decoded codepoints;
// passing false keeps the default of counting bytes.
func (d *Decoder) SetUseCodepointIndices(use bool) {
	d.useCodepointIndices = use
}

// Decode reads from its input and stores the decoded data to the value
// pointed to by v.
//
Expand Down Expand Up @@ -117,6 +126,47 @@ func (d *Decoder) readLines(v reflect.Value) (err error) {
return nil
}

// rawValue is a piece of input (a whole line or a single field) together with
// the optional lookup table needed to address it by codepoint.
type rawValue struct {
	bytes []byte
	// codepointIndices maps codepoint positions to byte offsets:
	// codepointIndices[n] is the offset in bytes at which the n-th (0-based)
	// codepoint starts. It is populated only when codepoint indexing was
	// requested and bytes contains at least one multi-byte codepoint;
	// otherwise it is nil and codepoint positions coincide with byte offsets.
	codepointIndices []int
}

// newRawValue wraps bytes in a rawValue.
//
// When useCodepointIndices is false the input is not inspected. When it is
// true the input is scanned as UTF-8 and, if it contains any multi-byte
// codepoint, a table of codepoint start offsets is built. ASCII-only input
// leaves the table nil so no allocation is made.
//
// An error is returned if useCodepointIndices is true and bytes is not valid
// UTF-8.
func newRawValue(bytes []byte, useCodepointIndices bool) (rawValue, error) {
	value := rawValue{bytes: bytes}
	if !useCodepointIndices {
		return value, nil
	}

	// Allocated lazily, only once a multi-byte codepoint is encountered.
	var codepointIndices []int
	for bytesIdx := 0; bytesIdx < len(bytes); {
		r, size := utf8.DecodeRune(bytes[bytesIdx:])
		// DecodeRune reports an invalid encoding as (RuneError, 1) and only
		// returns a size of 0 for empty input, which cannot occur inside this
		// loop. A literal U+FFFD in the input decodes with size 3 and is
		// therefore still accepted.
		if r == utf8.RuneError && size <= 1 {
			return rawValue{}, errors.New("fixedwidth: Invalid codepoint")
		}
		if codepointIndices == nil && size > 1 {
			// Everything before this point was single-byte, so codepoint n
			// starts at byte n. The number of codepoints never exceeds
			// len(bytes), so this capacity avoids any regrowth.
			codepointIndices = make([]int, bytesIdx, len(bytes))
			for i := range codepointIndices {
				codepointIndices[i] = i
			}
		}
		if codepointIndices != nil {
			codepointIndices = append(codepointIndices, bytesIdx)
		}
		bytesIdx += size
	}
	value.codepointIndices = codepointIndices
	return value, nil
}

func (d *Decoder) readLine(v reflect.Value) (err error, ok bool) {
var line []byte
line, err = d.data.ReadBytes('\n')
Expand All @@ -131,20 +181,45 @@ func (d *Decoder) readLine(v reflect.Value) (err error, ok bool) {
return nil, false
}
}
return newValueSetter(v.Type())(v, line), true
rawValue, err := newRawValue(line, d.useCodepointIndices)
if err != nil {
return
}
return newValueSetter(v.Type())(v, rawValue), true
}

// rawValueFromLine returns the field occupying positions startPos through
// endPos (1-based, inclusive) of the input, with surrounding whitespace
// trimmed. A start position beyond the end of the input yields an empty
// result, and an end position beyond the end is clamped to the input length.
func rawValueFromLine(line []byte, startPos, endPos int) []byte {
if len(line) == 0 || startPos > len(line) {
return []byte{}
}
if endPos > len(line) {
endPos = len(line)
func rawValueFromLine(value rawValue, startPos, endPos int) rawValue {
// Codepoint mode: positions index codepointIndices, which maps each
// codepoint to the byte offset where it starts.
if value.codepointIndices != nil {
if len(value.codepointIndices) == 0 || startPos > len(value.codepointIndices) {
return rawValue{bytes: []byte{}}
}
var relevantIndices []int
var lineBytes []byte
// The field reaches (or passes) the last codepoint: take everything from
// the start offset to the end of the input.
if endPos >= len(value.codepointIndices) {
relevantIndices = value.codepointIndices[startPos-1:]
lineBytes = value.bytes[relevantIndices[0]:]
} else {
// codepointIndices[endPos] is the byte offset of the first codepoint
// after the field, i.e. the exclusive end of the byte range.
relevantIndices = value.codepointIndices[startPos-1 : endPos]
lineBytes = value.bytes[relevantIndices[0]:value.codepointIndices[endPos]]
}
// NOTE(review): relevantIndices still holds offsets into value.bytes, not
// into the trimmed sub-slice returned here — confirm nothing re-slices this
// result by codepoint position.
return rawValue{
bytes: bytes.TrimSpace(lineBytes),
codepointIndices: relevantIndices,
}
} else {
// Byte mode (codepointIndices is nil): positions are plain byte offsets.
if len(value.bytes) == 0 || startPos > len(value.bytes) {
return rawValue{bytes: []byte{}}
}
if endPos > len(value.bytes) {
endPos = len(value.bytes)
}
return rawValue{
bytes: bytes.TrimSpace(value.bytes[startPos-1 : endPos]),
}
}
return bytes.TrimSpace(line[startPos-1 : endPos])
}

// valueSetter decodes raw into v and reports a failed conversion as an error.
// newValueSetter returns the implementation to use for a given type.
type valueSetter func(v reflect.Value, raw []byte) error
type valueSetter func(v reflect.Value, raw rawValue) error

var textUnmarshalerType = reflect.TypeOf(new(encoding.TextUnmarshaler)).Elem()

Expand Down Expand Up @@ -175,7 +250,7 @@ func newValueSetter(t reflect.Type) valueSetter {
return unknownSetter
}

func structSetter(v reflect.Value, raw []byte) error {
func structSetter(v reflect.Value, raw rawValue) error {
t := v.Type()
for i := 0; i < v.NumField(); i++ {
fv := v.Field(i)
Expand All @@ -190,41 +265,41 @@ func structSetter(v reflect.Value, raw []byte) error {
rawValue := rawValueFromLine(raw, startPos, endPos)
err := newValueSetter(sf.Type)(fv, rawValue)
if err != nil {
return &UnmarshalTypeError{string(rawValue), sf.Type, t.Name(), sf.Name, err}
return &UnmarshalTypeError{string(rawValue.bytes), sf.Type, t.Name(), sf.Name, err}
}
}
return nil
}

// unknownSetter always fails with a "fixedwidth: unknown type" error; both
// arguments are ignored.
func unknownSetter(v reflect.Value, raw []byte) error {
func unknownSetter(v reflect.Value, raw rawValue) error {
return errors.New("fixedwidth: unknown type")
}

// nilSetter resets v to the zero value of its type. The raw input is ignored
// and the returned error is always nil.
func nilSetter(v reflect.Value, _ []byte) error {
func nilSetter(v reflect.Value, _ rawValue) error {
v.Set(reflect.Zero(v.Type()))
return nil
}

// textUnmarshalerSetter returns a valueSetter for type t that delegates to the
// value's encoding.TextUnmarshaler implementation. When shouldAddr is true,
// UnmarshalText is invoked on the address of the value rather than on the
// value itself.
func textUnmarshalerSetter(t reflect.Type, shouldAddr bool) valueSetter {
return func(v reflect.Value, raw []byte) error {
return func(v reflect.Value, raw rawValue) error {
if shouldAddr {
v = v.Addr()
}
// allocate a new element when v is a nil pointer so that UnmarshalText
// has a receiver to write into
if t.Kind() == reflect.Ptr && v.IsNil() {
v.Set(reflect.New(t.Elem()))
}
return v.Interface().(encoding.TextUnmarshaler).UnmarshalText(raw)
return v.Interface().(encoding.TextUnmarshaler).UnmarshalText(raw.bytes)
}
}

// interfaceSetter decodes raw into the value held by the interface value v,
// using the setter that newValueSetter returns for that value's dynamic type.
func interfaceSetter(v reflect.Value, raw []byte) error {
func interfaceSetter(v reflect.Value, raw rawValue) error {
return newValueSetter(v.Elem().Type())(v.Elem(), raw)
}

func ptrSetter(t reflect.Type) valueSetter {
return func(v reflect.Value, raw []byte) error {
if len(raw) <= 0 {
return func(v reflect.Value, raw rawValue) error {
if len(raw.bytes) <= 0 {
return nilSetter(v, raw)
}
if v.IsNil() {
Expand All @@ -234,16 +309,16 @@ func ptrSetter(t reflect.Type) valueSetter {
}
}

// stringSetter stores the raw bytes in v as a string. The returned error is
// always nil.
func stringSetter(v reflect.Value, raw []byte) error {
v.SetString(string(raw))
func stringSetter(v reflect.Value, raw rawValue) error {
v.SetString(string(raw.bytes))
return nil
}

func intSetter(v reflect.Value, raw []byte) error {
if len(raw) < 1 {
func intSetter(v reflect.Value, raw rawValue) error {
if len(raw.bytes) < 1 {
return nil
}
i, err := strconv.Atoi(string(raw))
i, err := strconv.Atoi(string(raw.bytes))
if err != nil {
return err
}
Expand All @@ -252,11 +327,11 @@ func intSetter(v reflect.Value, raw []byte) error {
}

func floatSetter(bitSize int) valueSetter {
return func(v reflect.Value, raw []byte) error {
if len(raw) < 1 {
return func(v reflect.Value, raw rawValue) error {
if len(raw.bytes) < 1 {
return nil
}
f, err := strconv.ParseFloat(string(raw), bitSize)
f, err := strconv.ParseFloat(string(raw.bytes), bitSize)
if err != nil {
return err
}
Expand Down
84 changes: 83 additions & 1 deletion decode_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ func TestNewValueSetter(t *testing.T) {
// ensure we have an addressable target
var i = reflect.Indirect(reflect.New(reflect.TypeOf(tt.expected)))

err := newValueSetter(i.Type())(i, tt.raw)
err := newValueSetter(i.Type())(i, rawValue{bytes: tt.raw})
if tt.shouldErr != (err != nil) {
t.Errorf("newValueSetter(%s)() err want %v, have %v (%v)", reflect.TypeOf(tt.expected).Name(), tt.shouldErr, err != nil, err.Error())
}
Expand All @@ -222,6 +222,55 @@ func TestNewValueSetter(t *testing.T) {
}
}

// TestDecodeSetUseCodepointIndices checks that, with codepoint indexing
// enabled, the positions in the `fixed` tags select columns by codepoint
// rather than by byte. It covers ASCII-only, multi-byte, truncated and
// newline-less input.
func TestDecodeSetUseCodepointIndices(t *testing.T) {
	// Three consecutive fields, each five codepoints wide.
	type S struct {
		A string `fixed:"1,5"`
		B string `fixed:"6,10"`
		C string `fixed:"11,15"`
	}

	for _, tt := range []struct {
		name     string
		raw      []byte
		expected S
	}{
		{
			name:     "All ASCII characters",
			raw:      []byte("ABCD EFGH IJKL \n"),
			expected: S{"ABCD", "EFGH", "IJKL"},
		},
		{
			name: "Multi-byte characters",
			// Every field is padded to its full five-codepoint width so
			// that B starts at codepoint 6 and C at codepoint 11.
			raw:      []byte("ABCD ☃☃   EFG  \n"),
			expected: S{"ABCD", "☃☃", "EFG"},
		},
		{
			name:     "Truncated with multi-byte characters",
			raw:      []byte("☃☃\n"),
			expected: S{"☃☃", "", ""},
		},
		{
			// Subtest names are kept unique so a failure identifies its
			// case unambiguously.
			name:     "Multi-byte characters without trailing newline",
			raw:      []byte("PIÑA DEFGHIJKLM"),
			expected: S{"PIÑA", "DEFGH", "IJKLM"},
		},
	} {
		t.Run(tt.name, func(t *testing.T) {
			d := NewDecoder(bytes.NewReader(tt.raw))
			d.SetUseCodepointIndices(true)
			var s S
			err := d.Decode(&s)
			if err != nil {
				t.Errorf("Unexpected err: %v", err)
			}
			if !reflect.DeepEqual(tt.expected, s) {
				// %q prints the input as readable text instead of a list
				// of byte values.
				t.Errorf("Decode(%q) want %v, have %v", tt.raw, tt.expected, s)
			}
		})
	}
}

// Verify the behavior of Decoder.Decode at the end of a file. See
// https://github.com/ianlopshire/go-fixedwidth/issues/6 for more details.
func TestDecode_EOF(t *testing.T) {
Expand Down Expand Up @@ -250,3 +299,36 @@ func TestDecode_EOF(t *testing.T) {
t.Errorf("Decode should have returned an EOF error. Returned: %v", err)
}
}

// TestNewRawValue verifies the table of codepoint start offsets that
// newRawValue builds when codepoint indexing is requested. ASCII-only input
// is expected to leave the table nil.
func TestNewRawValue(t *testing.T) {
	type testCase struct {
		name     string
		input    []byte
		expected []int
	}

	cases := []testCase{
		{name: "All ASCII", input: []byte("ABC"), expected: nil},
		{name: "All multi-byte", input: []byte("☃☃☃"), expected: []int{0, 3, 6}},
		{name: "Mixed", input: []byte("abc☃☃☃123"), expected: []int{0, 1, 2, 3, 6, 9, 12, 13, 14}},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got, err := newRawValue(tc.input, true)
			if err != nil {
				t.Errorf("newRawValue(%v, true): Unexpected error", tc.input)
				return
			}
			if !reflect.DeepEqual(tc.expected, got.codepointIndices) {
				t.Errorf("newRawValue(%v, true): Unexpected result, expected %v got %v", tc.input, tc.expected, got.codepointIndices)
			}
		})
	}
}

0 comments on commit 0326432

Please sign in to comment.