Skip to content

Commit

Permalink
Implement content sniffing for HTML parsing
Browse files Browse the repository at this point in the history
Web pages can be served without a Content-Type header set, in which case
browsers employ content sniffing. Do the same here, in Colly.

While we're at it, change the Content-Type check to something stricter than
a mere "html" substring match.
  • Loading branch information
WGH- committed Mar 25, 2024
1 parent 4ccfe78 commit 69cc94a
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 2 deletions.
21 changes: 20 additions & 1 deletion colly.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
"hash/fnv"
"io"
"log"
"mime"
"net/http"
"net/http/cookiejar"
"net/url"
Expand Down Expand Up @@ -1117,9 +1118,27 @@ func (c *Collector) handleOnResponseHeaders(r *Response) {
}

func (c *Collector) handleOnHTML(resp *Response) error {
if len(c.htmlCallbacks) == 0 || !strings.Contains(strings.ToLower(resp.Headers.Get("Content-Type")), "html") {
if len(c.htmlCallbacks) == 0 {
return nil
}

contentType := resp.Headers.Get("Content-Type")
if contentType == "" {
contentType = http.DetectContentType(resp.Body)
}
mediaType, _, err := mime.ParseMediaType(contentType)
if err != nil && err != mime.ErrInvalidMediaParameter {
return fmt.Errorf("malformed Content-Type header value: %w", err)
}

// TODO: we also want to parse application/xml as XHTML if it has
// an appropriate doctype
switch mediaType {
case "text/html", "application/xhtml+xml":
default:
return nil
}

doc, err := goquery.NewDocumentFromReader(bytes.NewBuffer(resp.Body))
if err != nil {
return err
Expand Down
34 changes: 33 additions & 1 deletion colly_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,11 @@ func newUnstartedTestServer() *httptest.Server {
})

mux.HandleFunc("/html", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
if r.URL.Query().Get("no-content-type") != "" {
w.Header()["Content-Type"] = nil
} else {
w.Header().Set("Content-Type", "text/html")
}
w.Write([]byte(`<!DOCTYPE html>
<html>
<head>
Expand Down Expand Up @@ -627,6 +631,34 @@ func TestCollectorOnHTML(t *testing.T) {
}
}

// TestCollectorContentSniffing checks that OnHTML callbacks still run when
// the test server's /html handler is asked (via the no-content-type query
// parameter) to leave the Content-Type header unset.
func TestCollectorContentSniffing(t *testing.T) {
	server := newTestServer()
	defer server.Close()

	collector := NewCollector()

	// Guard the premise of the test: the response must not carry a
	// Content-Type header, otherwise sniffing is not what gets exercised.
	collector.OnResponse(func(resp *Response) {
		if (*resp.Headers)["Content-Type"] == nil {
			return
		}
		t.Error("Content-Type unexpectedly not nil")
	})

	sawHTML := false
	collector.OnHTML("html", func(_ *HTMLElement) {
		sawHTML = true
	})

	if err := collector.Visit(server.URL + "/html?no-content-type=yes"); err != nil {
		t.Fatal(err)
	}

	if !sawHTML {
		t.Error("OnHTML was not called")
	}
}

func TestCollectorURLRevisit(t *testing.T) {
ts := newTestServer()
defer ts.Close()
Expand Down

0 comments on commit 69cc94a

Please sign in to comment.