fix: drop incorrect namespace handling (#15)
muktihari authored Jun 30, 2024
1 parent 71c4417 commit 1750cca
Showing 12 changed files with 112 additions and 83 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -5,7 +5,7 @@
[![CodeCov](https://codecov.io/gh/muktihari/xmltokenizer/branch/master/graph/badge.svg)](https://codecov.io/gh/muktihari/xmltokenizer)
[![Go Report Card](https://goreportcard.com/badge/github.com/muktihari/xmltokenizer)](https://goreportcard.com/report/github.com/muktihari/xmltokenizer)

-XML Tokenizer is a low-memory high performance library for parsing simple XML 1.0. This is an alternative option to the standard library's xml when speed is your main concern. This may not cover all XML files, but it can cover typical XML files.
+XML Tokenizer is a low-memory high performance non-namespace parser library for parsing simple XML 1.0. This is an alternative option to the standard library's xml when speed is your main concern and you are willing to sacrifice certain features, such as handling the namespace, in favor of speed ([discussion](https://www.reddit.com/r/golang/comments/1drdji3/xml_tokenizer_thats_4x_faster_than_stdlibs_xml/?utm_source=share&utm_medium=web3x&utm_name=web3xcss&utm_term=1&utm_content=share_button)). This may not cover all XML files, but it can cover typical XML files.

# Motivation

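For orientation, here is a minimal sketch of the tokenizer loop the README describes. The `xmltokenizer.New(io.Reader)` constructor and `Token()` method are taken from this repository's usage docs rather than from this diff, so treat this as a sketch, not canonical usage:

```go
package main

import (
	"errors"
	"fmt"
	"io"
	"os"

	"github.com/muktihari/xmltokenizer"
)

func main() {
	f, err := os.Open("example.xml") // hypothetical input file
	if err != nil {
		panic(err)
	}
	defer f.Close()

	tok := xmltokenizer.New(f)
	for {
		token, err := tok.Token() // next raw token; prefixes are not resolved to namespaces
		if errors.Is(err, io.EOF) {
			break
		}
		if err != nil {
			panic(err)
		}
		fmt.Printf("%s\n", token.Name.Full) // e.g. "gpxtpx:hr" for <gpxtpx:hr>
	}
}
```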
4 changes: 2 additions & 2 deletions docs/USAGE.md
@@ -99,7 +99,7 @@ func (r *Row) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token
if token.IsEndElementOf(se) { // Reach desired EndElement
return nil
}
-if token.IsEndElement() { // Ignore child's EndElements
+if token.IsEndElement { // Ignore child's EndElements
continue
}
switch string(token.Name.Local) {
@@ -145,7 +145,7 @@ func (c *Cell) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Toke
if token.IsEndElementOf(se) { // Reach desired EndElement
return nil
}
-if token.IsEndElement() { // Ignore child's EndElements
+if token.IsEndElement { // Ignore child's EndElements
continue
}
switch string(token.Name.Local) {
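Both hunks above touch the same decode loop. Reassembled from the surrounding context lines, the pattern in the usage doc now reads roughly as follows; the `tok.Token()` call and error handling are filled in as assumptions, since they sit outside the diff context:

```go
func (r *Row) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token) error {
	for {
		token, err := tok.Token() // assumed: advance to the next token
		if err != nil {
			return err
		}
		if token.IsEndElementOf(se) { // Reach desired EndElement
			return nil
		}
		if token.IsEndElement { // Ignore child's EndElements
			continue
		}
		switch string(token.Name.Local) {
		// decode child elements here
		}
	}
}
```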
2 changes: 1 addition & 1 deletion internal/gpx/schema/extensions.go
@@ -38,7 +38,7 @@ func (t *TrackpointExtension) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xm
if token.IsEndElementOf(se) {
return nil
}
-if token.IsEndElement() {
+if token.IsEndElement {
continue
}

2 changes: 1 addition & 1 deletion internal/gpx/schema/gpx.go
@@ -35,7 +35,7 @@ func (g *GPX) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token
if token.IsEndElementOf(se) {
return nil
}
-if token.IsEndElement() {
+if token.IsEndElement {
continue
}

6 changes: 3 additions & 3 deletions internal/gpx/schema/metadata.go
@@ -27,7 +27,7 @@ func (m *Metadata) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.
if token.IsEndElementOf(se) {
return nil
}
-if token.IsEndElement() {
+if token.IsEndElement {
continue
}

@@ -123,7 +123,7 @@ func (a *Author) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.To
if token.IsEndElementOf(se) {
return nil
}
-if token.IsEndElement() {
+if token.IsEndElement {
continue
}

@@ -199,7 +199,7 @@ func (a *Link) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Toke
if token.IsEndElementOf(se) {
return nil
}
-if token.IsEndElement() {
+if token.IsEndElement {
continue
}

6 changes: 3 additions & 3 deletions internal/gpx/schema/track.go
@@ -26,7 +26,7 @@ func (t *Track) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Tok
if token.IsEndElementOf(se) {
return nil
}
-if token.IsEndElement() {
+if token.IsEndElement {
continue
}

@@ -98,7 +98,7 @@ func (t *TrackSegment) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokeni
if token.IsEndElementOf(se) {
return nil
}
-if token.IsEndElement() {
+if token.IsEndElement {
continue
}

@@ -186,7 +186,7 @@ func (w *Waypoint) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.
if token.IsEndElementOf(se) {
return nil
}
-if token.IsEndElement() {
+if token.IsEndElement {
continue
}

4 changes: 2 additions & 2 deletions internal/main.go
@@ -77,7 +77,7 @@ func (r *Row) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token
if token.IsEndElementOf(se) { // Reach desired EndElement
return nil
}
-if token.IsEndElement() { // Ignore child's EndElements
+if token.IsEndElement { // Ignore child's EndElements
continue
}
switch string(token.Name.Local) {
@@ -123,7 +123,7 @@ func (c *Cell) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Toke
if token.IsEndElementOf(se) { // Reach desired EndElement
return nil
}
-if token.IsEndElement() { // Ignore child's EndElements
+if token.IsEndElement { // Ignore child's EndElements
continue
}
switch string(token.Name.Local) {
6 changes: 3 additions & 3 deletions internal/xlsx/schema/sheet.go
@@ -21,7 +21,7 @@ func (s *SheetData) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer
if token.IsEndElementOf(se) {
break
}
-if token.IsEndElement() {
+if token.IsEndElement {
continue
}

@@ -67,7 +67,7 @@ func (r *Row) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Token
if token.IsEndElementOf(se) {
break
}
-if token.IsEndElement() {
+if token.IsEndElement {
continue
}

@@ -127,7 +127,7 @@ func (c *Cell) UnmarshalToken(tok *xmltokenizer.Tokenizer, se *xmltokenizer.Toke
if token.IsEndElementOf(se) {
break
}
-if token.IsEndElement() {
+if token.IsEndElement {
continue
}

43 changes: 18 additions & 25 deletions token.go
@@ -22,29 +22,21 @@ func PutToken(t *Token) { pool.Put(t) }
// <!ELEMENT library (book+)>
// <!ELEMENT book (title, author, year)>
// ]>
-//
-// Token includes CharData or CDATA in Data field when it appears right after the start element.
type Token struct {
-Name Name // Name can be a StartElement: "name", a EndElement: "/name" or empty when a tag starts with "<?" or "<!" (except "<![CDATA").
-Attrs []Attr // Attrs exist when len(Attrs) > 0.
-Data []byte // Data could be a CharData or a CDATA, or maybe a RawToken if a tag starts with "<?" or "<!" (except "<![CDATA").
-SelfClosing bool // True when a tag ends with "/>" e.g. <c r="E3" s="1" />. Also true when a tag starts with "<?" or "<!" (except "<![CDATA").
-}
-
-// IsEndElement checks whether the given token represent an end element (closing tag),
-// name start with '/'. e.g. </gpx>
-func (t *Token) IsEndElement() bool {
-if len(t.Name.Full) > 0 && t.Name.Full[0] == '/' {
-return true
-}
-return false
+Name Name // Name is an XML name, empty when a tag starts with "<?" or "<!".
+Attrs []Attr // Attrs exist when len(Attrs) > 0.
+Data []byte // Data could be a CharData or a CDATA, or maybe a RawToken if a tag starts with "<?" or "<!" (except "<![CDATA").
+SelfClosing bool // True when a tag ends with "/>" e.g. <c r="E3" s="1" />. Also true when a tag starts with "<?" or "<!" (except "<![CDATA").
+IsEndElement bool // True when a tag start with "</" e.g. </gpx> or </gpxtpx:atemp>.
}

// IsEndElementOf checks whether the given token represent a
-// n end element (closing tag) of given startElement.
-func (t *Token) IsEndElementOf(t2 *Token) bool {
-if !t.IsEndElement() {
-return false
-}
-if string(t.Name.Full[1:]) == string(t2.Name.Full) {
+// n end element (closing tag) of given StartElement.
+func (t *Token) IsEndElementOf(se *Token) bool {
+if t.IsEndElement &&
+string(t.Name.Full) == string(se.Name.Full) {
return true
}
return false
@@ -53,12 +45,13 @@ func (t *Token) IsEndElementOf(t2 *Token) bool {
// Copy copies src Token into t, returning t. Attrs should be
// consumed immediately since it's only being shallow copied.
func (t *Token) Copy(src Token) *Token {
-t.Name.Space = append(t.Name.Space[:0], src.Name.Space...)
+t.Name.Prefix = append(t.Name.Prefix[:0], src.Name.Prefix...)
t.Name.Local = append(t.Name.Local[:0], src.Name.Local...)
t.Name.Full = append(t.Name.Full[:0], src.Name.Full...)
t.Attrs = append(t.Attrs[:0], src.Attrs...) // shallow copy
t.Data = append(t.Data[:0], src.Data...)
t.SelfClosing = src.SelfClosing
+t.IsEndElement = src.IsEndElement
return t
}

Expand All @@ -68,10 +61,10 @@ type Attr struct {
Value []byte
}

-// Name represents an XML name (Local) annotated
-// with a name space identifier (Space).
+// Name represents an XML name <prefix:local>,
+// we don't manage the bookkeeping of namespaces.
type Name struct {
-Space []byte
-Local []byte
-Full []byte // Full is combination of "space:local"
+Prefix []byte
+Local []byte
+Full []byte // Full is combination of "prefix:local"
}
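The net effect of this hunk: an end element is now flagged by the new `IsEndElement` field instead of a leading '/' in `Name.Full`, so `IsEndElementOf` compares full names directly. A small sketch of the new matching behavior, with token values written out by hand:

```go
start := xmltokenizer.Token{
	Name: xmltokenizer.Name{
		Prefix: []byte("gpxtpx"),
		Local:  []byte("hr"),
		Full:   []byte("gpxtpx:hr"),
	},
}
end := xmltokenizer.Token{
	Name:         xmltokenizer.Name{Full: []byte("gpxtpx:hr")}, // no leading '/' anymore
	IsEndElement: true,
}
ok := end.IsEndElementOf(&start) // true: IsEndElement is set and the Full names match
_ = ok
```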
20 changes: 11 additions & 9 deletions token_test.go
@@ -27,8 +27,9 @@ func TestIsEndElement(t *testing.T) {
name: "an end element",
token: xmltokenizer.Token{
Name: xmltokenizer.Name{
Full: []byte("/worksheet"),
Full: []byte("worksheet"),
},
IsEndElement: true,
},
expected: true,
},
@@ -54,7 +55,7 @@

for _, tc := range tt {
t.Run(tc.name, func(t *testing.T) {
-if r := tc.token.IsEndElement(); r != tc.expected {
+if r := tc.token.IsEndElement; r != tc.expected {
t.Fatalf("expected: %t, got: %t", tc.expected, r)
}
})
@@ -71,8 +72,9 @@ func TestIsEndElementOf(t *testing.T) {
name: "correct end element",
t1: xmltokenizer.Token{
Name: xmltokenizer.Name{
Full: []byte("/worksheet"),
Full: []byte("worksheet"),
},
IsEndElement: true,
},
t2: xmltokenizer.Token{
Name: xmltokenizer.Name{
@@ -123,15 +125,15 @@
func TestCopy(t *testing.T) {
t1 := xmltokenizer.Token{
Name: xmltokenizer.Name{
-Space: []byte("gpxtpx"),
-Local: []byte("hr"),
-Full: []byte("gpxtpx:hr"),
+Prefix: []byte("gpxtpx"),
+Local: []byte("hr"),
+Full: []byte("gpxtpx:hr"),
},
Attrs: []xmltokenizer.Attr{{
Name: xmltokenizer.Name{
-Space: nil,
-Local: []byte("units"),
-Full: []byte("units"),
+Prefix: nil,
+Local: []byte("units"),
+Full: []byte("units"),
},
Value: []byte("bpm"),
}},
21 changes: 13 additions & 8 deletions tokenizer.go
@@ -17,7 +17,7 @@ const (
const (
defaultReadBufferSize = 4 << 10
autoGrowBufferMaxLimitSize = 1000 << 10
-defaultAttrsBufferSize = 8
+defaultAttrsBufferSize = 16
)

// Tokenizer is a XML tokenizer.
@@ -283,12 +283,13 @@ func (t *Tokenizer) manageBuffer() error {
}

func (t *Tokenizer) clearToken() {
-t.token.Name.Space = nil
+t.token.Name.Prefix = nil
t.token.Name.Local = nil
t.token.Name.Full = nil
t.token.Attrs = t.token.Attrs[:0]
t.token.Data = nil
t.token.SelfClosing = false
+t.token.IsEndElement = false
}

// consumeNonTagIdentifier consumes identifier starts with "<?" or "<!", make it raw data.
@@ -303,13 +304,17 @@ func (t *Tokenizer) consumeNonTagIdentifier(b []byte) []byte {

func (t *Tokenizer) consumeTagName(b []byte) []byte {
var pos, fullpos int
-for i := range b {
+for i := 0; i < len(b); i++ {
switch b[i] {
case '<':
+if b[i+1] == '/' {
+t.token.IsEndElement = true
+i++
+}
pos = i + 1
fullpos = i + 1
case ':':
-t.token.Name.Space = trim(b[pos:i])
+t.token.Name.Prefix = trim(b[pos:i])
pos = i + 1
case '>', ' ': // e.g. <gpx>, <trkpt lat="-7.1872750" lon="110.3450230">
if b[i] == '>' && b[i-1] == '/' { // In case we encounter <name/>
@@ -324,14 +329,14 @@
}

func (t *Tokenizer) consumeAttrs(b []byte) []byte {
-var space, local, full []byte
+var prefix, local, full []byte
var pos, fullpos int
var inquote bool
for i := range b {
switch b[i] {
case ':':
if !inquote {
-space = trim(b[pos:i])
+prefix = trim(b[pos:i])
pos = i + 1
}
case '=':
Expand All @@ -345,10 +350,10 @@ func (t *Tokenizer) consumeAttrs(b []byte) []byte {
continue
}
t.token.Attrs = append(t.token.Attrs, Attr{
-Name: Name{Space: space, Local: local, Full: full},
+Name: Name{Prefix: prefix, Local: local, Full: full},
Value: trim(b[pos+1 : i]),
})
-space, local, full = nil, nil, nil
+prefix, local, full = nil, nil, nil
pos = i + 1
fullpos = i + 1
}
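Taken together: `consumeTagName` now consumes the '/' of a closing tag and records it in the token's `IsEndElement` field, while both it and `consumeAttrs` keep the raw prefix rather than resolving a namespace. Roughly, the fields one would expect for a prefixed tag pair, inferred from the code above (not from a test in this commit):

```go
// <gpxtpx:atemp units="C"> tokenizes to, approximately:
//   token.Name.Prefix        == []byte("gpxtpx")
//   token.Name.Local         == []byte("atemp")
//   token.Name.Full          == []byte("gpxtpx:atemp")
//   token.Attrs[0].Name.Full == []byte("units")
//   token.Attrs[0].Value     == []byte("C")
//
// </gpxtpx:atemp> tokenizes to, approximately:
//   token.IsEndElement == true
//   token.Name.Full    == []byte("gpxtpx:atemp") // the '/' is no longer part of the name
```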