From aa2bbf6c126a0c695c7d53cf721ccd61f0e11491 Mon Sep 17 00:00:00 2001 From: Ian Lopshire Date: Fri, 7 Jan 2022 11:13:53 -0500 Subject: [PATCH] Add multibyte character support to encoder --- README.md | 15 ++++-- encode.go | 140 +++++++++++++++++++++++++++++++------------------ encode_test.go | 77 ++++++++++++++++++++++++++- tags.go | 15 +++++- tags_test.go | 20 +++++++ 5 files changed, 211 insertions(+), 56 deletions(-) diff --git a/README.md b/README.md index 405a7e0..cbab4d0 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,6 @@ if err != nil { fmt.Printf("%s", data) // Output: // 1 Ian Lopshire 99.5020 true - ``` ### Decode @@ -95,8 +94,10 @@ for { } ``` -If you have an input where the indices are expressed in unicode codepoints, and -not raw bytes fixedwidth supports this. Your data must be UTF-8 encoded: +### UTF-8, Codepoints, and Multibyte Characters + +fixedwidth supports encoding and decoding fixed-width data where indices are expressed in +unicode codepoints and not raw bytes. The data must be UTF-8 encoded. ```go decoder := fixedwidth.NewDecoder(strings.NewReader(data)) @@ -104,6 +105,14 @@ decoder.SetUseCodepointIndices(true) // Decode as usual now ``` + +```go +buff := new(bytes.Buffer) +encoder := fixedwidth.NewEncoder(buff) +encoder.SetUseCodepointIndices(true) +// Encode as usual now +``` + ### Alignment Behavior | Alignment | Encoding | Decoding | diff --git a/encode.go b/encode.go index 60d8a1b..704cb41 100644 --- a/encode.go +++ b/encode.go @@ -7,6 +7,7 @@ import ( "io" "reflect" "strconv" + "strings" ) // Marshal returns the fixed-width encoding of v. @@ -60,6 +61,8 @@ func (e *MarshalInvalidTypeError) Error() string { type Encoder struct { w *bufio.Writer lineTerminator []byte + + useCodepointIndices bool } // NewEncoder returns a new encoder that writes to w. @@ -77,6 +80,13 @@ func (e *Encoder) SetLineTerminator(lineTerminator []byte) { e.lineTerminator = lineTerminator } +// SetUseCodepointIndices configures `Encoder` on whether the indices in the +// `fixedwidth` struct tags are expressed in terms of bytes (the default +// behavior) or in terms of UTF-8 decoded codepoints. +func (e *Encoder) SetUseCodepointIndices(use bool) { + e.useCodepointIndices = use +} + // Encode writes the fixed-width encoding of v to the // stream. // See the documentation for Marshal for details about @@ -122,31 +132,31 @@ func (e *Encoder) writeLines(v reflect.Value) error { } func (e *Encoder) writeLine(v reflect.Value) (err error) { - b, err := newValueEncoder(v.Type())(v) + b, err := newValueEncoder(v.Type(), e.useCodepointIndices)(v) if err != nil { return err } - _, err = e.w.Write(b) + _, err = e.w.WriteString(b.data) return err } -type valueEncoder func(v reflect.Value) ([]byte, error) +type valueEncoder func(v reflect.Value) (rawValue, error) -func newValueEncoder(t reflect.Type) valueEncoder { +func newValueEncoder(t reflect.Type, useCodepointIndices bool) valueEncoder { if t == nil { return nilEncoder } if t.Implements(reflect.TypeOf(new(encoding.TextMarshaler)).Elem()) { - return textMarshalerEncoder + return textMarshalerEncoder(useCodepointIndices) } switch t.Kind() { case reflect.Ptr, reflect.Interface: - return ptrInterfaceEncoder + return ptrInterfaceEncoder(useCodepointIndices) case reflect.Struct: - return structEncoder + return structEncoder(useCodepointIndices) case reflect.String: - return stringEncoder + return stringEncoder(useCodepointIndices) case reflect.Int, reflect.Int64, reflect.Int32, reflect.Int16, reflect.Int8: return intEncoder case reflect.Float64: @@ -161,18 +171,20 @@ func newValueEncoder(t reflect.Type) valueEncoder { return unknownTypeEncoder(t) } -func (ve valueEncoder) Write(v reflect.Value, dst []byte, format format) error { +func (ve valueEncoder) Write(b *lineBuilder, v reflect.Value, spec fieldSpec) error { + format := spec.format + startIndex := spec.startPos - 1 value, err := ve(v) if err != nil { return err } - if len(value) < len(dst) { + if value.len() < spec.len() { switch { - case format.alignment == right: - padding := bytes.Repeat([]byte{format.padChar}, len(dst)-len(value)) - copy(dst, padding) - copy(dst[len(padding):], value) + case spec.format.alignment == right: + padding := strings.Repeat(string(format.padChar), spec.len()-value.len()) + b.WriteASCII(startIndex, padding) + b.WriteValue(startIndex+len(padding), value) return nil // The second case in this block is a special case to maintain backward @@ -180,74 +192,102 @@ func (ve valueEncoder) Write(v reflect.Value, dst []byte, format format) error { // written to dst. This means overlapping intervals can, in effect, be used to // coalesce a value. case format.alignment == left, format.alignment == defaultAlignment && format.padChar != ' ': - padding := bytes.Repeat([]byte{format.padChar}, len(dst)-len(value)) - copy(dst, value) - copy(dst[len(value):], padding) + padding := strings.Repeat(string(format.padChar), spec.len()-value.len()) + + b.WriteValue(startIndex, value) + b.WriteASCII(startIndex+value.len(), padding) return nil } } - copy(dst, value) + if value.len() > spec.len() { + // If the value is too long it needs to be trimmed. + // TODO: Add strict mode that returns in this case. + value, err = value.slice(0, spec.len()-1) + if err != nil { + return err + } + } + + b.WriteValue(startIndex, value) return nil } -func structEncoder(v reflect.Value) ([]byte, error) { - ss := cachedStructSpec(v.Type()) - dst := bytes.Repeat([]byte(" "), ss.ll) +func structEncoder(useCodepointIndices bool) valueEncoder { + return func(v reflect.Value) (rawValue, error) { + ss := cachedStructSpec(v.Type()) - for i, spec := range ss.fieldSpecs { - if !spec.ok { - continue + // Add a 10% headroom to the builder when codepoint indices are being used. + c := ss.ll + if useCodepointIndices { + c = int(1.1*float64(ss.ll)) + 1 } + b := newLineBuilder(ss.ll, c, ' ') - err := spec.encoder.Write(v.Field(i), dst[spec.startPos-1:spec.endPos:spec.endPos], spec.format) - if err != nil { - return nil, err + for i, spec := range ss.fieldSpecs { + if !spec.ok { + continue + } + + enc := spec.getEncoder(useCodepointIndices) + err := enc.Write(b, v.Field(i), spec) + if err != nil { + return rawValue{}, err + } } - } - return dst, nil + return b.AsRawValue(), nil + } } -func textMarshalerEncoder(v reflect.Value) ([]byte, error) { - return v.Interface().(encoding.TextMarshaler).MarshalText() +func textMarshalerEncoder(useCodepointIndices bool) valueEncoder { + return func(v reflect.Value) (rawValue, error) { + txt, err := v.Interface().(encoding.TextMarshaler).MarshalText() + if err != nil { + return rawValue{}, err + } + return newRawValue(string(txt), useCodepointIndices) + } } -func ptrInterfaceEncoder(v reflect.Value) ([]byte, error) { - if v.IsNil() { - return nilEncoder(v) +func ptrInterfaceEncoder(useCodepointIndices bool) valueEncoder { + return func(v reflect.Value) (rawValue, error) { + if v.IsNil() { + return nilEncoder(v) + } + return newValueEncoder(v.Elem().Type(), useCodepointIndices)(v.Elem()) } - return newValueEncoder(v.Elem().Type())(v.Elem()) } -func stringEncoder(v reflect.Value) ([]byte, error) { - return []byte(v.String()), nil +func stringEncoder(useCodepointIndices bool) valueEncoder { + return func(v reflect.Value) (rawValue, error) { + return newRawValue(v.String(), useCodepointIndices) + } } - -func intEncoder(v reflect.Value) ([]byte, error) { - return []byte(strconv.Itoa(int(v.Int()))), nil +func intEncoder(v reflect.Value) (rawValue, error) { + return newRawValue(strconv.Itoa(int(v.Int())), false) } func floatEncoder(perc, bitSize int) valueEncoder { - return func(v reflect.Value) ([]byte, error) { - return []byte(strconv.FormatFloat(v.Float(), 'f', perc, bitSize)), nil + return func(v reflect.Value) (rawValue, error) { + return newRawValue(strconv.FormatFloat(v.Float(), 'f', perc, bitSize), false) } } -func boolEncoder(v reflect.Value) ([]byte, error) { - return []byte(strconv.FormatBool(v.Bool())), nil +func boolEncoder(v reflect.Value) (rawValue, error) { + return newRawValue(strconv.FormatBool(v.Bool()), false) } -func nilEncoder(v reflect.Value) ([]byte, error) { - return nil, nil +func nilEncoder(_ reflect.Value) (rawValue, error) { + return rawValue{}, nil } func unknownTypeEncoder(t reflect.Type) valueEncoder { - return func(value reflect.Value) ([]byte, error) { - return nil, &MarshalInvalidTypeError{typeName: t.Name()} + return func(value reflect.Value) (rawValue, error) { + return rawValue{}, &MarshalInvalidTypeError{typeName: t.Name()} } } -func uintEncoder(v reflect.Value) ([]byte, error) { - return []byte(strconv.FormatUint(v.Uint(), 10)), nil +func uintEncoder(v reflect.Value) (rawValue, error) { + return newRawValue(strconv.FormatUint(v.Uint(), 10), false) } diff --git a/encode_test.go b/encode_test.go index 7d94978..861aef6 100644 --- a/encode_test.go +++ b/encode_test.go @@ -59,6 +59,9 @@ func TestMarshal(t *testing.T) { F1 interface{} `fixed:"1,5"` F2 interface{} `fixed:"6,10"` } + type H2 struct { + F1 bool `fixed:"1,1"` + } tagHelper := struct { Valid string `fixed:"1,5"` NoTags string @@ -76,6 +79,7 @@ func TestMarshal(t *testing.T) { }{ {"single line", H{"foo", 1}, []byte("foo 1 "), false}, {"multiple line", []H{{"foo", 1}, {"bar", 2}}, []byte("foo 1 \nbar 2 "), false}, + {"multiple line (diff struct)", []interface{}{H{"foo", 1}, H2{false}}, []byte("foo 1 \nf"), false}, {"empty slice", []H{}, nil, false}, {"pointer", &H{"foo", 1}, []byte("foo 1 "), false}, {"nil", nil, nil, false}, @@ -90,6 +94,59 @@ func TestMarshal(t *testing.T) { t.Errorf("Marshal() shouldErr expected %v, have %v (%v)", tt.shouldErr, err != nil, err) } if !tt.shouldErr && !bytes.Equal(o, tt.o) { + t.Errorf("Marshal() expected %q, have %q", string(tt.o), string(o)) + } + + // All tests should also pass with codepoint indices enabled. + t.Run("use codepoint indices", func(t *testing.T) { + buff := bytes.NewBuffer(nil) + enc := NewEncoder(buff) + enc.SetUseCodepointIndices(true) + err := enc.Encode(tt.i) + if tt.shouldErr != (err != nil) { + t.Errorf("Marshal() shouldErr expected %v, have %v (%v)", tt.shouldErr, err != nil, err) + } + if !tt.shouldErr && !bytes.Equal(buff.Bytes(), tt.o) { + t.Errorf("Marshal() expected %q, have %q", string(tt.o), string(o)) + } + }) + + }) + } +} + +func TestMarshal_useCodepointIndices(t *testing.T) { + type H struct { + F1 string `fixed:"1,5"` + F2 string `fixed:"6,10"` + F3 string `fixed:"11,15"` + } + + type HF struct { + F1 string `fixed:"1,5,right,#"` + F2 string `fixed:"6,10,left,#"` + F3 string `fixed:"11,15"` + } + + for _, tt := range []struct { + name string + i interface{} + o []byte + }{ + {name: "base case", i: H{"føø", "bår", "båz"}, o: []byte(`føø bår båz `)}, + {name: "overflow", i: H{"føøøøøøøøøø", "bååååååååår", "bååååååååz"}, o: []byte(`føøøøbååååbåååå`)}, + {name: "formatted", i: HF{"føø", "bår", "båz"}, o: []byte(`##føøbår##båz `)}, + {name: "multibformatted overflow", i: HF{"føøøøøøøøøø", "bååååååååår", "bååååååååz"}, o: []byte(`føøøøbååååbåååå`)}, + } { + t.Run(tt.name, func(t *testing.T) { + buff := bytes.NewBuffer(nil) + enc := NewEncoder(buff) + enc.SetUseCodepointIndices(true) + if err := enc.Encode(tt.i); err != nil { + t.Errorf("Marshal() unexpected error: %v", err) + return + } + if o := buff.Bytes(); !bytes.Equal(o, tt.o) { t.Errorf("Marshal() expected %s, have %s", tt.o, o) } }) @@ -130,6 +187,22 @@ func TestMarshal_format(t *testing.T) { want: []byte(`12345` + `12345` + `12345` + `12345` + `12345` + `12345`), shouldErr: false, }, + { + name: "pad right", + v: struct { + F1 string `fixed:"1,5,right,#"` + }{"foo"}, + want: []byte(`##foo`), + shouldErr: false, + }, + { + name: "pad left", + v: struct { + F1 string `fixed:"1,5,left,#"` + }{"foo"}, + want: []byte(`foo##`), + shouldErr: false, + }, } { t.Run(tt.name, func(t *testing.T) { have, err := Marshal(tt.v) @@ -228,11 +301,11 @@ func TestNewValueEncoder(t *testing.T) { {"*uint nil", nilUint, []byte(""), false}, } { t.Run(tt.name, func(t *testing.T) { - o, err := newValueEncoder(reflect.TypeOf(tt.i))(reflect.ValueOf(tt.i)) + o, err := newValueEncoder(reflect.TypeOf(tt.i), false)(reflect.ValueOf(tt.i)) if tt.shouldErr != (err != nil) { t.Errorf("newValueEncoder(%s)() shouldErr expected %v, have %v (%v)", reflect.TypeOf(tt.i).Name(), tt.shouldErr, err != nil, err) } - if !tt.shouldErr && !bytes.Equal(o, tt.o) { + if !tt.shouldErr && !bytes.Equal([]byte(o.data), tt.o) { t.Errorf("newValueEncoder(%s)() expected %v, have %v", reflect.TypeOf(tt.i).Name(), tt.o, o) } }) diff --git a/tags.go b/tags.go index 004f68c..2020e79 100644 --- a/tags.go +++ b/tags.go @@ -64,11 +64,23 @@ type structSpec struct { type fieldSpec struct { startPos, endPos int encoder valueEncoder + codepointEncoder valueEncoder setter valueSetter format format ok bool } +func (s fieldSpec) len() int { + return s.endPos - s.startPos + 1 +} + +func (s fieldSpec) getEncoder(useCodepointIndices bool) valueEncoder { + if useCodepointIndices { + return s.codepointEncoder + } + return s.encoder +} + func buildStructSpec(t reflect.Type) structSpec { ss := structSpec{ fieldSpecs: make([]fieldSpec, t.NumField()), @@ -90,7 +102,8 @@ func buildStructSpec(t reflect.Type) structSpec { ss.ll = ss.fieldSpecs[i].endPos } - ss.fieldSpecs[i].encoder = newValueEncoder(f.Type) + ss.fieldSpecs[i].encoder = newValueEncoder(f.Type, false) + ss.fieldSpecs[i].codepointEncoder = newValueEncoder(f.Type, true) ss.fieldSpecs[i].setter = newValueSetter(f.Type) } return ss diff --git a/tags_test.go b/tags_test.go index c4a0500..b29cce2 100644 --- a/tags_test.go +++ b/tags_test.go @@ -1,6 +1,7 @@ package fixedwidth import ( + "fmt" "reflect" "testing" ) @@ -52,3 +53,22 @@ func TestParseTag(t *testing.T) { }) } } + +func TestFieldSpec_len(t *testing.T) { + for _, tt := range []struct { + spec fieldSpec + want int + }{ + {fieldSpec{startPos: 1, endPos: 1}, 1}, + {fieldSpec{startPos: 1, endPos: 5}, 5}, + {fieldSpec{startPos: 5, endPos: 5}, 1}, + {fieldSpec{startPos: 6, endPos: 10}, 5}, + } { + t.Run(fmt.Sprintf("%v to %v", tt.spec.startPos, tt.spec.endPos), func(t *testing.T) { + if l := tt.spec.len(); l != tt.want { + t.Errorf("len() expected %v, have %v", tt.want, l) + + } + }) + } +}