Merge pull request #10 from alex/codepoint-indexing

* adds support for fixed indices expressed in terms of codepoints instead of bytes
* Additional tests
* Apply suggestions from code review
  Co-Authored-By: Jonathan Rudenberg <[email protected]>
* Micro-optimization
* Fixed an off-by-one error
* Further micro-optimization
* Unused variable
* Rename rawLine to rawValue
* Added benchmarks of using codepoint indices
* Added an example to the readme
ianlopshire · Jun 28, 2019 · 0326432 · 0326432
2 parents 5ac6eb2 + 1232f6b
commit 0326432
Show file tree

Hide file tree

Showing 4 changed files with 215 additions and 29 deletions.
diff --git a/README.md b/README.md
@@ -80,5 +80,14 @@ for {
 }
 ```
 
+If you have an input where the indices are expressed in Unicode codepoints
+rather than raw bytes, fixedwidth supports this. Your data must be UTF-8 encoded:
+
+```go
+decoder := fixedwidth.NewDecoder(strings.NewReader(data))
+decoder.SetUseCodepointIndices(true)
+// Decode as usual now
+```
+
 ## Licence
 MIT
diff --git a/bench_test.go b/bench_test.go
@@ -51,6 +51,26 @@ func BenchmarkUnmarshal_MixedData_100000(b *testing.B) {
 	}
 }
 
+func BenchmarkDecode_CodePoints_MixedData_1_Ascii(b *testing.B) {
+	data := []byte(`       foo       foo        42        42        42        42        42        42        42        42       4.2       4.2       4.2       4.2`)
+	var v mixedData
+	for i := 0; i < b.N; i++ {
+		d := NewDecoder(bytes.NewReader(data))
+		d.SetUseCodepointIndices(true)
+		_ = d.Decode(&v)
+	}
+}
+
+func BenchmarkDecode_CodePoints_MixedData_1_UTF8(b *testing.B) {
+	data := []byte(`       f☃☃       f☃☃        42        42        42        42        42        42        42        42       4.2       4.2       4.2       4.2`)
+	var v mixedData
+	for i := 0; i < b.N; i++ {
+		d := NewDecoder(bytes.NewReader(data))
+		d.SetUseCodepointIndices(true)
+		_ = d.Decode(&v)
+	}
+}
+
 func BenchmarkUnmarshal_String(b *testing.B) {
 	data := []byte(`foo       `)
 	var v struct {

diff --git a/decode.go b/decode.go
@@ -8,6 +8,7 @@ import (
 	"io"
 	"reflect"
 	"strconv"
+	"unicode/utf8"
 )
 
 // Unmarshal parses fixed width encoded data and stores the
@@ -19,8 +20,9 @@ func Unmarshal(data []byte, v interface{}) error {
 
 // A Decoder reads and decodes fixed width data from an input stream.
 type Decoder struct {
-	data *bufio.Reader
-	done bool
+	data                *bufio.Reader
+	done                bool
+	useCodepointIndices bool
 }
 
 // NewDecoder returns a new decoder that reads from r.
@@ -70,6 +72,13 @@ func (e *UnmarshalTypeError) Error() string {
 	return s
 }
 
+// SetUseCodepointIndices configures whether the Decoder interprets the indices
+// in the `fixed` struct tags in terms of bytes (the default behavior) or in
+// terms of UTF-8 decoded codepoints.
+func (d *Decoder) SetUseCodepointIndices(use bool) {
+	d.useCodepointIndices = use
+}
+
 // Decode reads from its input and stores the decoded data to the value
 // pointed to by v.
 //
@@ -117,6 +126,47 @@ func (d *Decoder) readLines(v reflect.Value) (err error) {
 	return nil
 }
 
+type rawValue struct {
+	bytes []byte
+	// Used when `SetUseCodepointIndices` has been called on `Decoder`. A
+	// mapping of codepoint indices into the bytes. So the
+	// `codepointIndices[n]` is the starting position for the n-th codepoint in
+	// `bytes`.
+	codepointIndices []int
+}
+
+func newRawValue(bytes []byte, useCodepointIndices bool) (rawValue, error) {
+	value := rawValue{
+		bytes: bytes,
+	}
+	if useCodepointIndices {
+		bytesIdx := 0
+		// Lazily allocate this only if the value actually contains a
+		// multi-byte character.
+		codepointIndices := []int(nil)
+		for bytesIdx < len(bytes) {
+			_, codepointSize := utf8.DecodeRune(bytes[bytesIdx:])
+			if codepointSize == 0 {
+				return rawValue{}, errors.New("fixedwidth: Invalid codepoint")
+			}
+			// We have a multi-byte codepoint, we need to allocate
+			// codepointIndices
+			if codepointIndices == nil && codepointSize > 1 {
+				codepointIndices = make([]int, bytesIdx)
+				for i := 0; i < bytesIdx; i++ {
+					codepointIndices[i] = i
+				}
+			}
+			if codepointIndices != nil {
+				codepointIndices = append(codepointIndices, bytesIdx)
+			}
+			bytesIdx += codepointSize
+		}
+		value.codepointIndices = codepointIndices
+	}
+	return value, nil
+}
+
 func (d *Decoder) readLine(v reflect.Value) (err error, ok bool) {
 	var line []byte
 	line, err = d.data.ReadBytes('\n')
@@ -131,20 +181,45 @@ func (d *Decoder) readLine(v reflect.Value) (err error, ok bool) {
 			return nil, false
 		}
 	}
-	return newValueSetter(v.Type())(v, line), true
+	rawValue, err := newRawValue(line, d.useCodepointIndices)
+	if err != nil {
+		return
+	}
+	return newValueSetter(v.Type())(v, rawValue), true
 }
 
-func rawValueFromLine(line []byte, startPos, endPos int) []byte {
-	if len(line) == 0 || startPos > len(line) {
-		return []byte{}
-	}
-	if endPos > len(line) {
-		endPos = len(line)
+func rawValueFromLine(value rawValue, startPos, endPos int) rawValue {
+	if value.codepointIndices != nil {
+		if len(value.codepointIndices) == 0 || startPos > len(value.codepointIndices) {
+			return rawValue{bytes: []byte{}}
+		}
+		var relevantIndices []int
+		var lineBytes []byte
+		if endPos >= len(value.codepointIndices) {
+			relevantIndices = value.codepointIndices[startPos-1:]
+			lineBytes = value.bytes[relevantIndices[0]:]
+		} else {
+			relevantIndices = value.codepointIndices[startPos-1 : endPos]
+			lineBytes = value.bytes[relevantIndices[0]:value.codepointIndices[endPos]]
+		}
+		return rawValue{
+			bytes:            bytes.TrimSpace(lineBytes),
+			codepointIndices: relevantIndices,
+		}
+	} else {
+		if len(value.bytes) == 0 || startPos > len(value.bytes) {
+			return rawValue{bytes: []byte{}}
+		}
+		if endPos > len(value.bytes) {
+			endPos = len(value.bytes)
+		}
+		return rawValue{
+			bytes: bytes.TrimSpace(value.bytes[startPos-1 : endPos]),
+		}
 	}
-	return bytes.TrimSpace(line[startPos-1 : endPos])
 }
 
-type valueSetter func(v reflect.Value, raw []byte) error
+type valueSetter func(v reflect.Value, raw rawValue) error
 
 var textUnmarshalerType = reflect.TypeOf(new(encoding.TextUnmarshaler)).Elem()
 
@@ -175,7 +250,7 @@ func newValueSetter(t reflect.Type) valueSetter {
 	return unknownSetter
 }
 
-func structSetter(v reflect.Value, raw []byte) error {
+func structSetter(v reflect.Value, raw rawValue) error {
 	t := v.Type()
 	for i := 0; i < v.NumField(); i++ {
 		fv := v.Field(i)
@@ -190,41 +265,41 @@ func structSetter(v reflect.Value, raw []byte) error {
 		rawValue := rawValueFromLine(raw, startPos, endPos)
 		err := newValueSetter(sf.Type)(fv, rawValue)
 		if err != nil {
-			return &UnmarshalTypeError{string(rawValue), sf.Type, t.Name(), sf.Name, err}
+			return &UnmarshalTypeError{string(rawValue.bytes), sf.Type, t.Name(), sf.Name, err}
 		}
 	}
 	return nil
 }
 
-func unknownSetter(v reflect.Value, raw []byte) error {
+func unknownSetter(v reflect.Value, raw rawValue) error {
 	return errors.New("fixedwidth: unknown type")
 }
 
-func nilSetter(v reflect.Value, _ []byte) error {
+func nilSetter(v reflect.Value, _ rawValue) error {
 	v.Set(reflect.Zero(v.Type()))
 	return nil
 }
 
 func textUnmarshalerSetter(t reflect.Type, shouldAddr bool) valueSetter {
-	return func(v reflect.Value, raw []byte) error {
+	return func(v reflect.Value, raw rawValue) error {
 		if shouldAddr {
 			v = v.Addr()
 		}
 		// set to zero value if this is nil
 		if t.Kind() == reflect.Ptr && v.IsNil() {
 			v.Set(reflect.New(t.Elem()))
 		}
-		return v.Interface().(encoding.TextUnmarshaler).UnmarshalText(raw)
+		return v.Interface().(encoding.TextUnmarshaler).UnmarshalText(raw.bytes)
 	}
 }
 
-func interfaceSetter(v reflect.Value, raw []byte) error {
+func interfaceSetter(v reflect.Value, raw rawValue) error {
 	return newValueSetter(v.Elem().Type())(v.Elem(), raw)
 }
 
 func ptrSetter(t reflect.Type) valueSetter {
-	return func(v reflect.Value, raw []byte) error {
-		if len(raw) <= 0 {
+	return func(v reflect.Value, raw rawValue) error {
+		if len(raw.bytes) <= 0 {
 			return nilSetter(v, raw)
 		}
 		if v.IsNil() {
@@ -234,16 +309,16 @@ func ptrSetter(t reflect.Type) valueSetter {
 	}
 }
 
-func stringSetter(v reflect.Value, raw []byte) error {
-	v.SetString(string(raw))
+func stringSetter(v reflect.Value, raw rawValue) error {
+	v.SetString(string(raw.bytes))
 	return nil
 }
 
-func intSetter(v reflect.Value, raw []byte) error {
-	if len(raw) < 1 {
+func intSetter(v reflect.Value, raw rawValue) error {
+	if len(raw.bytes) < 1 {
 		return nil
 	}
-	i, err := strconv.Atoi(string(raw))
+	i, err := strconv.Atoi(string(raw.bytes))
 	if err != nil {
 		return err
 	}
@@ -252,11 +327,11 @@ func intSetter(v reflect.Value, raw []byte) error {
 }
 
 func floatSetter(bitSize int) valueSetter {
-	return func(v reflect.Value, raw []byte) error {
-		if len(raw) < 1 {
+	return func(v reflect.Value, raw rawValue) error {
+		if len(raw.bytes) < 1 {
 			return nil
 		}
-		f, err := strconv.ParseFloat(string(raw), bitSize)
+		f, err := strconv.ParseFloat(string(raw.bytes), bitSize)
 		if err != nil {
 			return err
 		}

diff --git a/decode_test.go b/decode_test.go
@@ -211,7 +211,7 @@ func TestNewValueSetter(t *testing.T) {
 			// ensure we have an addressable target
 			var i = reflect.Indirect(reflect.New(reflect.TypeOf(tt.expected)))
 
-			err := newValueSetter(i.Type())(i, tt.raw)
+			err := newValueSetter(i.Type())(i, rawValue{bytes: tt.raw})
 			if tt.shouldErr != (err != nil) {
 				t.Errorf("newValueSetter(%s)() err want %v, have %v (%v)", reflect.TypeOf(tt.expected).Name(), tt.shouldErr, err != nil, err.Error())
 			}
@@ -222,6 +222,55 @@ func TestNewValueSetter(t *testing.T) {
 	}
 }
 
+func TestDecodeSetUseCodepointIndices(t *testing.T) {
+	type S struct {
+		A string `fixed:"1,5"`
+		B string `fixed:"6,10"`
+		C string `fixed:"11,15"`
+	}
+
+	for _, tt := range []struct {
+		name     string
+		raw      []byte
+		expected S
+	}{
+		{
+			name:     "All ASCII characters",
+			raw:      []byte("ABCD EFGH IJKL \n"),
+			expected: S{"ABCD", "EFGH", "IJKL"},
+		},
+		{
+			name:     "Multi-byte characters",
+			raw:      []byte("ABCD ☃☃   EFG  \n"),
+			expected: S{"ABCD", "☃☃", "EFG"},
+		},
+		{
+			name:     "Truncated with multi-byte characters",
+			raw:      []byte("☃☃\n"),
+			expected: S{"☃☃", "", ""},
+		},
+		{
+			name:     "Multi-byte characters",
+			raw:      []byte("PIÑA DEFGHIJKLM"),
+			expected: S{"PIÑA", "DEFGH", "IJKLM"},
+		},
+	} {
+		t.Run(tt.name, func(t *testing.T) {
+			d := NewDecoder(bytes.NewReader(tt.raw))
+			d.SetUseCodepointIndices(true)
+			var s S
+			err := d.Decode(&s)
+			if err != nil {
+				t.Errorf("Unexpected err: %v", err)
+			}
+			if !reflect.DeepEqual(tt.expected, s) {
+				t.Errorf("Decode(%v) want %v, have %v", tt.raw, tt.expected, s)
+			}
+		})
+	}
+
+}
+
 // Verify the behavior of Decoder.Decode at the end of a file. See
 // https://github.com/ianlopshire/go-fixedwidth/issues/6 for more details.
 func TestDecode_EOF(t *testing.T) {
@@ -250,3 +299,36 @@ func TestDecode_EOF(t *testing.T) {
 		t.Errorf("Decode should have returned an EOF error. Returned: %v", err)
 	}
 }
+
+func TestNewRawValue(t *testing.T) {
+	for _, tt := range []struct {
+		name     string
+		input    []byte
+		expected []int
+	}{
+		{
+			name:     "All ASCII",
+			input:    []byte("ABC"),
+			expected: []int(nil),
+		},
+		{
+			name:     "All multi-byte",
+			input:    []byte("☃☃☃"),
+			expected: []int{0, 3, 6},
+		},
+		{
+			name:     "Mixed",
+			input:    []byte("abc☃☃☃123"),
+			expected: []int{0, 1, 2, 3, 6, 9, 12, 13, 14},
+		},
+	} {
+		t.Run(tt.name, func(t *testing.T) {
+			result, err := newRawValue(tt.input, true)
+			if err != nil {
+				t.Errorf("newRawValue(%v, true): Unexpected error", tt.input)
+			} else if !reflect.DeepEqual(tt.expected, result.codepointIndices) {
+				t.Errorf("newRawValue(%v, true): Unexpected result, expected %v got %v", tt.input, tt.expected, result.codepointIndices)
+			}
+		})
+	}
+}