Better xml parsing #157

Merged · 4 commits · Nov 12, 2024
Changes from all commits
5 changes: 3 additions & 2 deletions internal/pkg/crawl/capture.go
@@ -467,10 +467,11 @@ func (c *Crawl) Capture(item *queue.Item) error {
 
 		outlinks = append(outlinks, URLsFromS3...)
 	} else {
-		URLsFromXML, isSitemap, err := extractor.XML(resp)
+		URLsFromXML, isSitemap, err := extractor.XML(resp, false)
 		if err != nil {
 			c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from XML")
-		} else {
+		}
+		if len(URLsFromXML) > 0 {
 			if isSitemap {
 				outlinks = append(outlinks, URLsFromXML...)
 			} else {
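Reviewer note: the dropped `else` is the behavioral point of this hunk. In lax mode, `extractor.XML` can return both an error and the URLs it extracted before hitting the bad token, so the call site now logs the failure and still consumes partial results. A minimal sketch of that contract, with `resp`, `outlinks`, and the logger as illustrative stand-ins for the crawler's real state:

	// Sketch only (not part of the PR): the post-#157 calling convention.
	URLsFromXML, isSitemap, err := extractor.XML(resp, false) // false = lax parsing
	if err != nil {
		log.Printf("unable to extract URLs from XML: %v", err) // log, don't discard
	}
	if len(URLsFromXML) > 0 {
		if isSitemap {
			// sitemap URLs become outlinks even if parsing stopped early
			outlinks = append(outlinks, URLsFromXML...)
		}
		// (non-sitemap handling elided, as in the collapsed diff)
	}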
37 changes: 19 additions & 18 deletions internal/pkg/crawl/extractor/xml.go
@@ -9,36 +9,37 @@ import (
 	"strings"
 )
 
-func XML(resp *http.Response) (URLs []*url.URL, sitemap bool, err error) {
+var sitemapMarker = []byte("sitemaps.org/schemas/sitemap/")
+
+func XML(resp *http.Response, strict bool) (URLs []*url.URL, sitemap bool, err error) {
 	xmlBody, err := io.ReadAll(resp.Body)
 	if err != nil {
 		return nil, sitemap, err
 	}
 
-	if strings.Contains(string(xmlBody), "sitemaps.org/schemas/sitemap/") {
+	if bytes.Contains(xmlBody, sitemapMarker) {
 		sitemap = true
 	}
 
-	reader := bytes.NewReader(xmlBody)
-	decoder := xml.NewDecoder(reader)
-
-	// try to decode one token to see if stream is open
-	_, err = decoder.Token()
-	if err != nil {
-		return nil, sitemap, err
-	}
-
-	// seek back to 0 if we are still here
-	reader.Seek(0, 0)
-	decoder = xml.NewDecoder(reader)
+	decoder := xml.NewDecoder(bytes.NewReader(xmlBody))
+	decoder.Strict = strict
 
+	var tok xml.Token
 	for {
-		tok, err := decoder.Token()
-		if err == io.EOF {
+		if strict {
+			tok, err = decoder.Token()
+		} else {
+			tok, err = decoder.RawToken()
+		}
+
+		if tok == nil && err == io.EOF {
 			// normal EOF
 			break
 		}
 
 		if err != nil {
-			return nil, sitemap, err
+			// return URLs we got so far when error occurs
+			return URLs, sitemap, err
 		}
 
 		switch tok := tok.(type) {
@@ -52,7 +53,7 @@ func XML(resp *http.Response) (URLs []*url.URL, sitemap bool, err error) {
 			}
 		}
 	case xml.CharData:
-		if strings.HasPrefix(string(tok), "http") {
+		if bytes.HasPrefix(tok, []byte("http")) {
 			parsedURL, err := url.Parse(string(tok))
 			if err == nil {
 				URLs = append(URLs, parsedURL)
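To make the strict/lax split concrete: strict mode keeps encoding/xml's default well-formedness checks (Decoder.Token with Strict = true), while lax mode sets Strict = false and pulls tokens via Decoder.RawToken, which skips element-nesting verification; in both modes the function now hands back whatever URLs it collected before a failure. A hedged sketch, not part of this PR, that could sit in the extractor package as an example test; the unbalanced input mirrors the new test case below:

	package extractor

	import (
		"fmt"
		"io"
		"net/http"
		"strings"
	)

	// Illustrative sketch only: compare strict and lax parsing of the same
	// unbalanced document. RawToken does not verify that start and end
	// elements match, so lax mode keeps extracting past the stray tags.
	func ExampleXML_strictVersusLax() {
		body := `<unbalance>
		<url>http://example.com</url>
		</unbalance></unbalance></unbalance>
		<outsideurl>https://unclosed.example.com</outsideurl>`

		// XML drains resp.Body with io.ReadAll, so build a fresh response per call.
		newResp := func() *http.Response {
			return &http.Response{Body: io.NopCloser(strings.NewReader(body))}
		}

		strictURLs, _, strictErr := XML(newResp(), true)
		laxURLs, _, laxErr := XML(newResp(), false)

		fmt.Println(len(strictURLs), strictErr != nil)
		fmt.Println(len(laxURLs), laxErr != nil)
		// Output:
		// 1 true
		// 2 false
	}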
158 changes: 105 additions & 53 deletions internal/pkg/crawl/extractor/xml_test.go
@@ -2,21 +2,26 @@ package extractor
 
 import (
 	"bytes"
+	"encoding/xml"
 	"io"
 	"net/http"
 	"net/url"
 	"os"
+	"strings"
 	"testing"
)
 
 func TestXML(t *testing.T) {
 	tests := []struct {
-		name          string
-		xmlBody       string
-		wantURLs      []*url.URL
-		wantURLsCount int
-		wantErr       bool
-		sitemap       bool
+		name                string
+		xmlBody             string
+		wantURLsLax         []*url.URL
+		wantURLsStrict      []*url.URL
+		wantURLsCountLax    int
+		wantURLsCountStrict int
+		wantErrLax          bool
+		wantErrStrict       bool
+		sitemap             bool
 	}{
 		{
 			name: "Valid XML with URLs",
@@ -28,26 +33,49 @@ func TestXML(t *testing.T) {
 			</nested>
 			<noturl>just some text</noturl>
 			</root>`,
-			wantURLs: []*url.URL{
+			wantURLsLax: []*url.URL{
+				{Scheme: "http", Host: "example.com"},
+				{Scheme: "https", Host: "example.org"},
+			},
+			wantURLsStrict: []*url.URL{
 				{Scheme: "http", Host: "example.com"},
 				{Scheme: "https", Host: "example.org"},
 			},
 			sitemap: false,
-			wantErr: false,
 		},
 		{
-			name:     "Empty XML",
-			xmlBody:  `<root></root>`,
-			wantURLs: nil,
-			wantErr:  false,
-			sitemap:  false,
+			name: "unbalanced XML with URLs",
+			xmlBody: `
+		<unbalance>
+			<url>http://example.com</url>
+		</unbalance></unbalance></unbalance>
+		<outsideurl>https://unclosed.example.com</outsideurl>`,
+			wantURLsStrict: []*url.URL{
+				{Scheme: "http", Host: "example.com"},
+			},
+			wantURLsLax: []*url.URL{
+				{Scheme: "http", Host: "example.com"},
+				{Scheme: "https", Host: "unclosed.example.com"},
+			},
+			wantErrStrict: true,
+			wantErrLax:    false,
+			sitemap:       false,
+		},
+		{
+			name:           "Empty XML",
+			xmlBody:        `<root></root>`,
+			wantURLsStrict: nil,
+			wantURLsLax:    nil,
+			sitemap:        false,
 		},
 		{
-			name:     "Invalid XML",
-			xmlBody:  `<root><unclosed>`,
-			wantURLs: nil,
-			wantErr:  true,
-			sitemap:  false,
+			name:           "alien XML",
+			xmlBody:        `<h4 73><?/>/<AS "='AS "ASD@'SD>,as;g^&R$W#Sf)(U><l;rpkv ]])`,
+			wantURLsStrict: nil,
+			wantURLsLax:    nil,
+			wantErrStrict:  true,
+			wantErrLax:     true,
+			sitemap:        false,
 		},
 		{
 			name: "XML with invalid URL",
@@ -56,48 +84,54 @@ func TestXML(t *testing.T) {
 			<item>http://example.com</item>
 			<item>not a valid url</item>
 			</root>`,
-			wantURLs: []*url.URL{
+			wantURLsStrict: []*url.URL{
+				{Scheme: "http", Host: "example.com"},
+			},
+			wantURLsLax: []*url.URL{
 				{Scheme: "http", Host: "example.com"},
 			},
-			wantErr: false,
-			sitemap: false,
+			wantErrStrict: false,
+			wantErrLax:    false,
+			sitemap:       false,
 		},
 		{
-			name:          "Huge sitemap",
-			xmlBody:       loadTestFile(t, "xml_test_sitemap.xml"),
-			wantURLsCount: 100002,
-			wantErr:       false,
-			sitemap:       true,
+			name:                "Huge sitemap",
+			xmlBody:             loadTestFile(t, "xml_test_sitemap.xml"),
+			wantURLsCountStrict: 100002,
+			wantURLsCountLax:    100002,
+			wantErrStrict:       false,
+			wantErrLax:          false,
+			sitemap:             true,
 		},
 	}
 
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			resp := &http.Response{
-				Body: io.NopCloser(bytes.NewBufferString(tt.xmlBody)),
-			}
-
-			gotURLs, sitemap, err := XML(resp)
-			if (err != nil) != tt.wantErr {
-				t.Errorf("XML() error = %v, wantErr %v", err, tt.wantErr)
-				return
-			}
-
-			if tt.wantURLsCount != 0 {
-				if len(gotURLs) != tt.wantURLsCount {
-					t.Errorf("XML() gotURLs count = %v, want %v", len(gotURLs), tt.wantURLsCount)
-				}
-			}
-
-			if tt.wantURLs != nil {
-				if !compareURLs(gotURLs, tt.wantURLs) {
-					t.Errorf("XML() gotURLs = %v, want %v", gotURLs, tt.wantURLs)
-				}
-			}
-
-			if tt.sitemap != sitemap {
-				t.Errorf("XML() sitemap = %v, want %v", sitemap, tt.sitemap)
-			}
+			testMode := func(strict bool, wantErr bool, wantURLs []*url.URL, wantURLsCount int) {
+				resp := &http.Response{
+					Body: io.NopCloser(bytes.NewBufferString(tt.xmlBody)),
+				}
+
+				gotURLs, sitemap, err := XML(resp, strict)
+				if (err != nil) != wantErr {
+					t.Errorf("XML() strict = %v, error = %v, wantErr %v", strict, err, wantErr)
+					return
+				}
+				if wantURLsCount != 0 && len(gotURLs) != wantURLsCount {
+					t.Errorf("XML() strict = %v, gotURLs count = %v, want %v", strict, len(gotURLs), wantURLsCount)
+				}
+				if wantURLs != nil && !compareURLs(gotURLs, wantURLs) {
+					t.Errorf("XML() strict = %v, gotURLs = %v, want %v", strict, gotURLs, wantURLs)
+				}
+				if tt.sitemap != sitemap {
+					t.Errorf("XML() strict = %v, sitemap = %v, want %v", strict, sitemap, tt.sitemap)
+				}
+			}
+
+			// Strict mode
+			testMode(true, tt.wantErrStrict, tt.wantURLsStrict, tt.wantURLsCountStrict)
+
+			// Lax mode
+			testMode(false, tt.wantErrLax, tt.wantURLsLax, tt.wantURLsCountLax)
 		})
 	}
 }
Expand All @@ -116,14 +150,32 @@ func loadTestFile(t *testing.T, path string) string {
 	return string(b)
 }
 
-func TestXMLBodyReadError(t *testing.T) {
+func TestXMLBodySyntaxEOFErrorStrict(t *testing.T) {
+	wantErr := xml.SyntaxError{Line: 3, Msg: "unexpected EOF"}
 	resp := &http.Response{
-		Body: io.NopCloser(bytes.NewReader([]byte{})), // Empty reader to simulate EOF
+		Body: io.NopCloser(strings.NewReader(
+			`<unclosed>
+			<closed>
+			</closed> <!-- Syntax EOF here -->`)),
 	}
-	resp.Body.Close() // Close the body to simulate a read error
 
-	_, _, err := XML(resp)
+	_, _, err := XML(resp, true)
 	if err == nil {
-		t.Errorf("XML() expected error, got nil")
+		t.Errorf("XML() error = %v, wantErr %v", err, wantErr)
+		return
+	}
+	if err.Error() != wantErr.Error() {
+		t.Errorf("XML() error = %v, wantErr %v", err, wantErr)
+	}
+}
+
+func TestXMLBodySyntaxEOFErrorLax(t *testing.T) {
+	resp := &http.Response{
+		Body: io.NopCloser(strings.NewReader(`<unclosed>
+			<closed>
+			</closed> <!-- ignore Syntax EOF here -->`)),
+	}
+	_, _, err := XML(resp, false)
+	if err != nil {
+		t.Errorf("XML() error = %v, wantErr nil", err)
 	}
 }
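With the table-driven matrix above, each case now runs twice, once per mode; from the repo root, `go test ./internal/pkg/crawl/extractor/ -run 'TestXML'` should exercise the table plus both EOF tests.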