Skip to content

Commit

Permalink
Merge pull request #808 from WGH-/content-sniffing
Browse files Browse the repository at this point in the history
Implement content sniffing for HTML parsing
  • Loading branch information
asciimoo committed Mar 27, 2024
2 parents c8b9cba + bad50ff commit 5224b97
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 2 deletions.
20 changes: 19 additions & 1 deletion colly.go
Original file line number Diff line number Diff line change
Expand Up @@ -1117,9 +1117,27 @@ func (c *Collector) handleOnResponseHeaders(r *Response) {
}

func (c *Collector) handleOnHTML(resp *Response) error {
if len(c.htmlCallbacks) == 0 || !strings.Contains(strings.ToLower(resp.Headers.Get("Content-Type")), "html") {
if len(c.htmlCallbacks) == 0 {
return nil
}

contentType := resp.Headers.Get("Content-Type")
if contentType == "" {
contentType = http.DetectContentType(resp.Body)
}
// implementation of mime.ParseMediaType without parsing the params
// part
mediatype, _, _ := strings.Cut(contentType, ";")
mediatype = strings.TrimSpace(strings.ToLower(mediatype))

// TODO we also want to parse application/xml as XHTML if it has
// appropriate doctype
switch mediatype {
case "text/html", "application/xhtml+xml":
default:
return nil
}

doc, err := goquery.NewDocumentFromReader(bytes.NewBuffer(resp.Body))
if err != nil {
return err
Expand Down
34 changes: 33 additions & 1 deletion colly_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,11 @@ func newUnstartedTestServer() *httptest.Server {
})

mux.HandleFunc("/html", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
if r.URL.Query().Get("no-content-type") != "" {
w.Header()["Content-Type"] = nil
} else {
w.Header().Set("Content-Type", "text/html")
}
w.Write([]byte(`<!DOCTYPE html>
<html>
<head>
Expand Down Expand Up @@ -627,6 +631,34 @@ func TestCollectorOnHTML(t *testing.T) {
}
}

// TestCollectorContentSniffing checks that OnHTML callbacks still fire when
// the response is served without a Content-Type header.
func TestCollectorContentSniffing(t *testing.T) {
	server := newTestServer()
	defer server.Close()

	collector := NewCollector()

	// Guard the premise of the test: the response must arrive with no
	// Content-Type entry, otherwise the OnHTML assertion below proves nothing.
	collector.OnResponse(func(resp *Response) {
		if values := (*resp.Headers)["Content-Type"]; values != nil {
			t.Error("Content-Type unexpectedly not nil")
		}
	})

	// Record whether the HTML callback was reached for the root element.
	sawHTML := false
	collector.OnHTML("html", func(_ *HTMLElement) {
		sawHTML = true
	})

	// The query parameter asks the test server to omit the header.
	if err := collector.Visit(server.URL + "/html?no-content-type=yes"); err != nil {
		t.Fatal(err)
	}

	if !sawHTML {
		t.Error("OnHTML was not called")
	}
}

func TestCollectorURLRevisit(t *testing.T) {
ts := newTestServer()
defer ts.Close()
Expand Down

0 comments on commit 5224b97

Please sign in to comment.