fix: drop incorrect namespace handling (#15)
muktihari authored Jun 30, 2024
1 parent 71c4417 commit 1750cca
Showing 12 changed files with 112 additions and 83 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -5,7 +5,7 @@
[![CodeCov](https://codecov.io/gh/muktihari/xmltokenizer/branch/master/graph/badge.svg)](https://codecov.io/gh/muktihari/xmltokenizer)
[![Go Report Card](https://goreportcard.com/badge/github.com/muktihari/xmltokenizer)](https://goreportcard.com/report/github.com/muktihari/xmltokenizer)

-XML Tokenizer is a low-memory high performance library for parsing simple XML 1.0. This is an alternative option to the standard library's xml when speed is your main concern. This may not cover all XML files, but it can cover typical XML files.
+XML Tokenizer is a low-memory high performance non-namespace parser library for parsing simple XML 1.0. This is an alternative option to the standard library's xml when speed is your main concern and you are willing to sacrifice certain features, such as handling the namespace, in favor of speed ([discussion](https://www.reddit.com/r/golang/comments/1drdji3/xml_tokenizer_thats_4x_faster_than_stdlibs_xml/?utm_source=share&utm_medium=web3x&utm_name=web3xcss&utm_term=1&utm_content=share_button)). This may not cover all XML files, but it can cover typical XML files.

# Motivation

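For orientation, here is a minimal sketch of the tokenizer loop the README describes. The `xmltokenizer.New(io.Reader)` constructor and `Token()` method are taken from this repository's usage docs rather than from this diff, so treat this as a sketch, not canonical usage:

```go
package main

import (
	"errors"
	"fmt"
	"io"
	"os"

	"github.com/muktihari/xmltokenizer"
)

func main() {
	f, err := os.Open("example.xml") // hypothetical input file
	if err != nil {
		panic(err)
	}
	defer f.Close()

	tok := xmltokenizer.New(f)
	for {
		token, err := tok.Token() // next raw token; prefixes are not resolved to namespaces
		if errors.Is(err, io.EOF) {
			break
		}
		if err != nil {
			panic(err)
		}
		fmt.Printf("%s\n", token.Name.Full) // e.g. "gpxtpx:hr" for <gpxtpx:hr>
	}
}
```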
4 changes: 2 additions & 2 deletions docs/USAGE.md
@@ -99,7 +99,7 @@ func (r *Row) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token
if token.IsEndElementOf(se) { // Reach desired EndElement
return nil
}
-if token.IsEndElement() { // Ignore child's EndElements
+if token.IsEndElement { // Ignore child's EndElements
continue
}
switch string(token.Name.Local) {
@@ -145,7 +145,7 @@ func (c *Cell) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Toke
if token.IsEndElementOf(se) { // Reach desired EndElement
return nil
}
-if token.IsEndElement() { // Ignore child's EndElements
+if token.IsEndElement { // Ignore child's EndElements
continue
}
switch string(token.Name.Local) {
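Both hunks above touch the same decode loop. Reassembled from the surrounding context lines, the pattern in the usage doc now reads roughly as follows; the `tok.Token()` call and error handling are filled in as assumptions, since they sit outside the diff context:

```go
func (r *Row) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token) error {
	for {
		token, err := tok.Token() // assumed: advance to the next token
		if err != nil {
			return err
		}
		if token.IsEndElementOf(se) { // Reach desired EndElement
			return nil
		}
		if token.IsEndElement { // Ignore child's EndElements
			continue
		}
		switch string(token.Name.Local) {
		// decode child elements here
		}
	}
}
```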
2 changes: 1 addition & 1 deletion internal/gpx/schema/extensions.go
@@ -38,7 +38,7 @@ func (t *TrackpointExtension) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xm
if token.IsEndElementOf(se) {
return nil
}
-if token.IsEndElement() {
+if token.IsEndElement {
continue
}

2 changes: 1 addition & 1 deletion internal/gpx/schema/gpx.go
@@ -35,7 +35,7 @@ func (g *GPX) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token
if token.IsEndElementOf(se) {
return nil
}
-if token.IsEndElement() {
+if token.IsEndElement {
continue
}

6 changes: 3 additions & 3 deletions internal/gpx/schema/metadata.go
@@ -27,7 +27,7 @@ func (m *Metadata) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.
if token.IsEndElementOf(se) {
return nil
}
-if token.IsEndElement() {
+if token.IsEndElement {
continue
}

@@ -123,7 +123,7 @@ func (a *Author) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.To
if token.IsEndElementOf(se) {
return nil
}
-if token.IsEndElement() {
+if token.IsEndElement {
continue
}

@@ -199,7 +199,7 @@ func (a *Link) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Toke
if token.IsEndElementOf(se) {
return nil
}
-if token.IsEndElement() {
+if token.IsEndElement {
continue
}

6 changes: 3 additions & 3 deletions internal/gpx/schema/track.go
@@ -26,7 +26,7 @@ func (t *Track) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Tok
if token.IsEndElementOf(se) {
return nil
}
-if token.IsEndElement() {
+if token.IsEndElement {
continue
}

@@ -98,7 +98,7 @@ func (t *TrackSegment) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokeni
if token.IsEndElementOf(se) {
return nil
}
-if token.IsEndElement() {
+if token.IsEndElement {
continue
}

@@ -186,7 +186,7 @@ func (w *Waypoint) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.
if token.IsEndElementOf(se) {
return nil
}
-if token.IsEndElement() {
+if token.IsEndElement {
continue
}

4 changes: 2 additions & 2 deletions internal/main.go
@@ -77,7 +77,7 @@ func (r *Row) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token
if token.IsEndElementOf(se) { // Reach desired EndElement
return nil
}
-if token.IsEndElement() { // Ignore child's EndElements
+if token.IsEndElement { // Ignore child's EndElements
continue
}
switch string(token.Name.Local) {
@@ -123,7 +123,7 @@ func (c *Cell) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Toke
if token.IsEndElementOf(se) { // Reach desired EndElement
return nil
}
-if token.IsEndElement() { // Ignore child's EndElements
+if token.IsEndElement { // Ignore child's EndElements
continue
}
switch string(token.Name.Local) {
6 changes: 3 additions & 3 deletions internal/xlsx/schema/sheet.go
@@ -21,7 +21,7 @@ func (s *SheetData) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer
if token.IsEndElementOf(se) {
break
}
-if token.IsEndElement() {
+if token.IsEndElement {
continue
}

@@ -67,7 +67,7 @@ func (r *Row) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token
if token.IsEndElementOf(se) {
break
}
-if token.IsEndElement() {
+if token.IsEndElement {
continue
}

@@ -127,7 +127,7 @@ func (c *Cell) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Toke
if token.IsEndElementOf(se) {
break
}
-if token.IsEndElement() {
+if token.IsEndElement {
continue
}

43 changes: 18 additions & 25 deletions token.go
@@ -22,29 +22,21 @@ func PutToken(t *Token) { pool.Put(t) }
// <!ELEMENT library (book+)>
// <!ELEMENT book (title, author, year)>
// ]>
-//
-// Token includes CharData or CDATA in Data field when it appears right after the start element.
type Token struct {
-Name Name // Name can be a StartElement: "name", a EndElement: "/name" or empty when a tag starts with "<?" or "<!" (except "<![CDATA").
-Attrs []Attr // Attrs exist when len(Attrs) > 0.
-Data []byte // Data could be a CharData or a CDATA, or maybe a RawToken if a tag starts with "<?" or "<!" (except "<![CDATA").
-SelfClosing bool // True when a tag ends with "/>" e.g. <c r="E3" s="1" />. Also true when a tag starts with "<?" or "<!" (except "<![CDATA").
-}
-
-// IsEndElement checks whether the given token represent an end element (closing tag),
-// name start with '/'. e.g. </gpx>
-func (t *Token) IsEndElement() bool {
-if len(t.Name.Full) > 0 && t.Name.Full[0] == '/' {
-return true
-}
-return false
+Name Name // Name is an XML name, empty when a tag starts with "<?" or "<!".
+Attrs []Attr // Attrs exist when len(Attrs) > 0.
+Data []byte // Data could be a CharData or a CDATA, or maybe a RawToken if a tag starts with "<?" or "<!" (except "<![CDATA").
+SelfClosing bool // True when a tag ends with "/>" e.g. <c r="E3" s="1" />. Also true when a tag starts with "<?" or "<!" (except "<![CDATA").
+IsEndElement bool // True when a tag start with "</" e.g. </gpx> or </gpxtpx:atemp>.
}

// IsEndElementOf checks whether the given token represent a
-// n end element (closing tag) of given startElement.
-func (t *Token) IsEndElementOf(t2 *Token) bool {
-if !t.IsEndElement() {
-return false
-}
-if string(t.Name.Full[1:]) == string(t2.Name.Full) {
+// n end element (closing tag) of given StartElement.
+func (t *Token) IsEndElementOf(se *Token) bool {
+if t.IsEndElement &&
+string(t.Name.Full) == string(se.Name.Full) {
return true
}
return false
@@ -53,12 +45,13 @@ func (t *Token) IsEndElementOf(t2 *Token) bool {
// Copy copies src Token into t, returning t. Attrs should be
// consumed immediately since it's only being shallow copied.
func (t *Token) Copy(src Token) *Token {
-t.Name.Space = append(t.Name.Space[:0], src.Name.Space...)
+t.Name.Prefix = append(t.Name.Prefix[:0], src.Name.Prefix...)
t.Name.Local = append(t.Name.Local[:0], src.Name.Local...)
t.Name.Full = append(t.Name.Full[:0], src.Name.Full...)
t.Attrs = append(t.Attrs[:0], src.Attrs...) // shallow copy
t.Data = append(t.Data[:0], src.Data...)
t.SelfClosing = src.SelfClosing
+t.IsEndElement = src.IsEndElement
return t
}

Expand All @@ -68,10 +61,10 @@ type Attr struct {
Value []byte
}

-// Name represents an XML name (Local) annotated
-// with a name space identifier (Space).
+// Name represents an XML name <prefix:local>,
+// we don't manage the bookkeeping of namespaces.
type Name struct {
-Space []byte
-Local []byte
-Full []byte // Full is combination of "space:local"
+Prefix []byte
+Local []byte
+Full []byte // Full is combination of "prefix:local"
}
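The net effect of this hunk: an end element is now flagged by the new `IsEndElement` field instead of a leading '/' in `Name.Full`, so `IsEndElementOf` compares full names directly. A small sketch of the new matching behavior, with token values written out by hand:

```go
start := xmltokenizer.Token{
	Name: xmltokenizer.Name{
		Prefix: []byte("gpxtpx"),
		Local:  []byte("hr"),
		Full:   []byte("gpxtpx:hr"),
	},
}
end := xmltokenizer.Token{
	Name:         xmltokenizer.Name{Full: []byte("gpxtpx:hr")}, // no leading '/' anymore
	IsEndElement: true,
}
ok := end.IsEndElementOf(&start) // true: IsEndElement is set and the Full names match
_ = ok
```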
20 changes: 11 additions & 9 deletions token_test.go
@@ -27,8 +27,9 @@ func TestIsEndElement(t *testing.T) {
name: "an end element",
token: xmltokenizer.Token{
Name: xmltokenizer.Name{
Full: []byte("/worksheet"),
Full: []byte("worksheet"),
},
IsEndElement: true,
},
expected: true,
},
@@ -54,7 +55,7 @@

for _, tc := range tt {
t.Run(tc.name, func(t *testing.T) {
-if r := tc.token.IsEndElement(); r != tc.expected {
+if r := tc.token.IsEndElement; r != tc.expected {
t.Fatalf("expected: %t, got: %t", tc.expected, r)
}
})
@@ -71,8 +72,9 @@ func TestIsEndElementOf(t *testing.T) {
name: "correct end element",
t1: xmltokenizer.Token{
Name: xmltokenizer.Name{
Full: []byte("/worksheet"),
Full: []byte("worksheet"),
},
IsEndElement: true,
},
t2: xmltokenizer.Token{
Name: xmltokenizer.Name{
@@ -123,15 +125,15 @@
func TestCopy(t *testing.T) {
t1 := xmltokenizer.Token{
Name: xmltokenizer.Name{
-Space: []byte("gpxtpx"),
-Local: []byte("hr"),
-Full: []byte("gpxtpx:hr"),
+Prefix: []byte("gpxtpx"),
+Local: []byte("hr"),
+Full: []byte("gpxtpx:hr"),
},
Attrs: []xmltokenizer.Attr{{
Name: xmltokenizer.Name{
-Space: nil,
-Local: []byte("units"),
-Full: []byte("units"),
+Prefix: nil,
+Local: []byte("units"),
+Full: []byte("units"),
},
Value: []byte("bpm"),
}},
21 changes: 13 additions & 8 deletions tokenizer.go
@@ -17,7 +17,7 @@ const (
const (
defaultReadBufferSize = 4 << 10
autoGrowBufferMaxLimitSize = 1000 << 10
-defaultAttrsBufferSize = 8
+defaultAttrsBufferSize = 16
)

// Tokenizer is a XML tokenizer.
@@ -283,12 +283,13 @@ func (t *Tokenizer) manageBuffer() error {
}

func (t *Tokenizer) clearToken() {
-t.token.Name.Space = nil
+t.token.Name.Prefix = nil
t.token.Name.Local = nil
t.token.Name.Full = nil
t.token.Attrs = t.token.Attrs[:0]
t.token.Data = nil
t.token.SelfClosing = false
+t.token.IsEndElement = false
}

// consumeNonTagIdentifier consumes identifier starts with "<?" or "<!", make it raw data.
@@ -303,13 +304,17 @@ func (t *Tokenizer) consumeNonTagIdentifier(b []byte) []byte {

func (t *Tokenizer) consumeTagName(b []byte) []byte {
var pos, fullpos int
-for i := range b {
+for i := 0; i < len(b); i++ {
switch b[i] {
case '<':
+if b[i+1] == '/' {
+t.token.IsEndElement = true
+i++
+}
pos = i + 1
fullpos = i + 1
case ':':
-t.token.Name.Space = trim(b[pos:i])
+t.token.Name.Prefix = trim(b[pos:i])
pos = i + 1
case '>', ' ': // e.g. <gpx>, <trkpt lat="-7.1872750" lon="110.3450230">
if b[i] == '>' && b[i-1] == '/' { // In case we encounter <name/>
@@ -324,14 +329,14 @@
}

func (t *Tokenizer) consumeAttrs(b []byte) []byte {
-var space, local, full []byte
+var prefix, local, full []byte
var pos, fullpos int
var inquote bool
for i := range b {
switch b[i] {
case ':':
if !inquote {
-space = trim(b[pos:i])
+prefix = trim(b[pos:i])
pos = i + 1
}
case '=':
Expand All @@ -345,10 +350,10 @@ func (t *Tokenizer) consumeAttrs(b []byte) []byte {
continue
}
t.token.Attrs = append(t.token.Attrs, Attr{
-Name: Name{Space: space, Local: local, Full: full},
+Name: Name{Prefix: prefix, Local: local, Full: full},
Value: trim(b[pos+1 : i]),
})
-space, local, full = nil, nil, nil
+prefix, local, full = nil, nil, nil
pos = i + 1
fullpos = i + 1
}
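Taken together: `consumeTagName` now consumes the '/' of a closing tag and records it in the token's `IsEndElement` field, while both it and `consumeAttrs` keep the raw prefix rather than resolving a namespace. Roughly, the fields one would expect for a prefixed tag pair, inferred from the code above (not from a test in this commit):

```go
// <gpxtpx:atemp units="C"> tokenizes to, approximately:
//   token.Name.Prefix        == []byte("gpxtpx")
//   token.Name.Local         == []byte("atemp")
//   token.Name.Full          == []byte("gpxtpx:atemp")
//   token.Attrs[0].Name.Full == []byte("units")
//   token.Attrs[0].Value     == []byte("C")
//
// </gpxtpx:atemp> tokenizes to, approximately:
//   token.IsEndElement == true
//   token.Name.Full    == []byte("gpxtpx:atemp") // the '/' is no longer part of the name
```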