Skip to content

Commit

Permalink
Don't decompress gzip if data doesn't look like gzip
Browse files Browse the repository at this point in the history
Prevents incorrect response being returned in cases like
/sitemap.xml.gz is requested, but uncompressed 404 page is served
instead.

Thanks-to: Seth Davis <seth@xyplanningnetwork.com>
  • Loading branch information
WGH- committed Nov 21, 2023
1 parent 1b000c4 commit d5f1ff8
Show file tree
Hide file tree
Showing 2 changed files with 88 additions and 3 deletions.
68 changes: 68 additions & 0 deletions colly_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ const testXML = `<?xml version="1.0" encoding="UTF-8"?>
<paragraph type="description">This is a test paragraph</paragraph>
</page>`

const custom404 = `404 not found`

func newUnstartedTestServer() *httptest.Server {
mux := http.NewServeMux()

Expand Down Expand Up @@ -86,6 +88,14 @@ func newUnstartedTestServer() *httptest.Server {
ww.Write([]byte(testXML))
})

mux.HandleFunc("/nonexistent.xml.gz", func(w http.ResponseWriter, r *http.Request) {
http.Error(w, custom404, http.StatusNotFound)
})

mux.HandleFunc("/empty-response.xml.gz", func(w http.ResponseWriter, r *http.Request) {
// write nothing
})

mux.HandleFunc("/login", func(w http.ResponseWriter, r *http.Request) {
if r.Method == "POST" {
w.Header().Set("Content-Type", "text/html")
Expand Down Expand Up @@ -1477,6 +1487,64 @@ func TestCollectorOnXMLWithXMLCompressed(t *testing.T) {
testCollectorOnXMLWithXML(t, "/test.xml.gz")
}

func TestCollectorNonexistentXMLGZ(t *testing.T) {
// This is a regression test for colly
// attempting to decompress all .xml.gz URLs
// even if they're not compressed.
ts := newTestServer()
defer ts.Close()

c := NewCollector(ParseHTTPErrorResponse())

onResponseCalled := false

c.OnResponse(func(resp *Response) {
onResponseCalled = true
if got, want := strings.TrimSpace(string(resp.Body)), custom404; got != want {
t.Errorf("wrong response body got=%q want=%q", got, want)
}
})

c.OnError(func(resp *Response, err error) {
t.Errorf("called on OnError: err=%v", err)
})

c.Visit(ts.URL + "/nonexistent.xml.gz")

if !onResponseCalled {
t.Error("OnResponse was not called")
}
}

func TestCollectorEmptyXMLGZ(t *testing.T) {
// This is a regression test for colly
// attempting to decompress all .xml.gz URLs
// even if they're not compressed.
ts := newTestServer()
defer ts.Close()

c := NewCollector()

onResponseCalled := false

c.OnResponse(func(resp *Response) {
onResponseCalled = true
if got, want := strings.TrimSpace(string(resp.Body)), ""; got != want {
t.Errorf("wrong response body got=%q want=%q", got, want)
}
})

c.OnError(func(resp *Response, err error) {
t.Errorf("called on OnError: err=%v", err)
})

c.Visit(ts.URL + "/empty-response.xml.gz")

if !onResponseCalled {
t.Error("OnResponse was not called")
}
}

func TestCollectorVisitWithTrace(t *testing.T) {
ts := newTestServer()
defer ts.Close()
Expand Down
23 changes: 20 additions & 3 deletions http_backend.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
package colly

import (
"bufio"
"crypto/sha1"
"encoding/gob"
"encoding/hex"
Expand Down Expand Up @@ -202,11 +203,27 @@ func (h *httpBackend) Do(request *http.Request, bodySize int, checkHeadersFunc c
}
contentEncoding := strings.ToLower(res.Header.Get("Content-Encoding"))
if !res.Uncompressed && (strings.Contains(contentEncoding, "gzip") || (contentEncoding == "" && strings.Contains(strings.ToLower(res.Header.Get("Content-Type")), "gzip")) || strings.HasSuffix(strings.ToLower(finalRequest.URL.Path), ".xml.gz")) {
bodyReader, err = gzip.NewReader(bodyReader)
if err != nil {
// Even if URL contains .xml.gz, it doesn't mean that we get gzip
// compressed data back. We might get 404 error page instead,
// for example. So check gzip magic bytes.
bufReader := bufio.NewReader(bodyReader)
bodyReader = bufReader
magic, err := bufReader.Peek(2)
switch err {
case io.EOF:
// less than 2 bytes, do nothing
case nil:
// gzip magic, as specified in RFC 1952
if magic[0] == 0x1f && magic[1] == 0x8b {
bodyReader, err = gzip.NewReader(bufReader)
if err != nil {
return nil, err
}
defer bodyReader.(*gzip.Reader).Close()
}
default:
return nil, err
}
defer bodyReader.(*gzip.Reader).Close()
}
body, err := io.ReadAll(bodyReader)
if err != nil {
Expand Down

0 comments on commit d5f1ff8

Please sign in to comment.