Skip to content

Commit

Permalink
Don't decompress gzip if data doesn't look like gzip
Browse files Browse the repository at this point in the history
Prevents an incorrect response from being returned in cases where, for
example, /sitemap.xml.gz is requested but an uncompressed 404 page is
served instead.
  • Loading branch information
WGH- committed Jan 16, 2023
1 parent 9ef9229 commit 50c9eeb
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 5 deletions.
41 changes: 38 additions & 3 deletions colly_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,15 @@ Disallow: /disallowed
Disallow: /allowed*q=
`

const testXml = `<?xml version="1.0" encoding="UTF-8"?>
const testXML = `<?xml version="1.0" encoding="UTF-8"?>
<page>
<title>Test Page</title>
<paragraph type="description">This is a test page</paragraph>
<paragraph type="description">This is a test paragraph</paragraph>
</page>`

// custom404 is the plain-text body the test server sends with its 404
// response for /nonexistent.xml.gz; tests compare response bodies against it.
const custom404 = `404 not found`

func newTestServer() *httptest.Server {
mux := http.NewServeMux()

Expand All @@ -77,13 +79,17 @@ func newTestServer() *httptest.Server {

mux.HandleFunc("/xml", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/xml")
w.Write([]byte(testXml))
w.Write([]byte(testXML))
})

mux.HandleFunc("/test.xml.gz", func(w http.ResponseWriter, r *http.Request) {
ww := gzip.NewWriter(w)
defer ww.Close()
ww.Write([]byte(testXml))
ww.Write([]byte(testXML))
})

mux.HandleFunc("/nonexistent.xml.gz", func(w http.ResponseWriter, r *http.Request) {
http.Error(w, custom404, http.StatusNotFound)
})

mux.HandleFunc("/login", func(w http.ResponseWriter, r *http.Request) {
Expand Down Expand Up @@ -1431,6 +1437,35 @@ func TestCollectorOnXMLWithXMLCompressed(t *testing.T) {
testCollectorOnXMLWithXML(t, "/test.xml.gz")
}

// TestCollectorNonexistentXMLGZ is a regression test for the collector
// trying to gunzip every response whose URL ends in .xml.gz, even when the
// body is not compressed. It requests /nonexistent.xml.gz, which the test
// server answers with a plain-text 404 page, and expects that page to reach
// OnResponse unchanged and OnError not to fire.
func TestCollectorNonexistentXMLGZ(t *testing.T) {
	server := newTestServer()
	defer server.Close()

	// Set by the OnResponse callback so we can verify it actually ran.
	var sawResponse bool

	// ParseHTTPErrorResponse lets the 404 body be delivered to OnResponse.
	collector := NewCollector(ParseHTTPErrorResponse())

	collector.OnError(func(resp *Response, err error) {
		t.Errorf("called on OnError: err=%v", err)
	})

	collector.OnResponse(func(resp *Response) {
		sawResponse = true

		// Compare with surrounding whitespace removed, so that e.g. a
		// trailing newline from the server does not cause a mismatch.
		got := strings.TrimSpace(string(resp.Body))
		want := custom404
		if got != want {
			t.Errorf("wrong response body got=%q want=%q", got, want)
		}
	})

	collector.Visit(server.URL + "/nonexistent.xml.gz")

	if !sawResponse {
		t.Error("OnResponse was not called")
	}
}

func TestCollectorVisitWithTrace(t *testing.T) {
ts := newTestServer()
defer ts.Close()
Expand Down
17 changes: 15 additions & 2 deletions http_backend.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
package colly

import (
"bufio"
"crypto/sha1"
"encoding/gob"
"encoding/hex"
Expand Down Expand Up @@ -201,11 +202,23 @@ func (h *httpBackend) Do(request *http.Request, bodySize int, checkHeadersFunc c
}
contentEncoding := strings.ToLower(res.Header.Get("Content-Encoding"))
if !res.Uncompressed && (strings.Contains(contentEncoding, "gzip") || (contentEncoding == "" && strings.Contains(strings.ToLower(res.Header.Get("Content-Type")), "gzip")) || strings.HasSuffix(strings.ToLower(request.URL.Path), ".xml.gz")) {
bodyReader, err = gzip.NewReader(bodyReader)
// Even if URL contains .xml.gz, it doesn't mean that we get gzip
// compressed data back. We might get 404 error page instead,
// for example. So check gzip magic bytes.
bufReader := bufio.NewReader(bodyReader)
bodyReader = bufReader
magic, err := bufReader.Peek(2)
if err != nil {
return nil, err
}
defer bodyReader.(*gzip.Reader).Close()
// gzip magic, as specified in RFC 1952
if magic[0] == 0x1f && magic[1] == 0x8b {
bodyReader, err = gzip.NewReader(bufReader)
if err != nil {
return nil, err
}
defer bodyReader.(*gzip.Reader).Close()
}
}
body, err := ioutil.ReadAll(bodyReader)
if err != nil {
Expand Down

0 comments on commit 50c9eeb

Please sign in to comment.