Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

new. add reset callback function #618

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
14 changes: 7 additions & 7 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@ jobs:

- name: Test
run: |
go install golang.org/x/lint/golint@latest
curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(go env GOPATH)/bin v1.54.2
OUT="$(go get -a)"; test -z "$OUT" || (echo "$OUT" && return 1)
OUT="$(gofmt -l -d ./)"; test -z "$OUT" || (echo "$OUT" && return 1)
golint -set_exit_status
golangci-lint run
go vet -v ./...
go test -race -v -coverprofile=coverage.txt -covermode=atomic ./...

Expand Down Expand Up @@ -62,18 +62,18 @@ jobs:

- name: Build
run: |
go install golang.org/x/lint/golint@latest
curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(go env GOPATH)/bin v1.54.2
OUT="$(go get -a)"; test -z "$OUT" || (echo "$OUT" && return 1)
OUT="$(gofmt -l -d ./)"; test -z "$OUT" || (echo "$OUT" && return 1)
golint -set_exit_status
golangci-lint run
go build

codecov:
name: Codecov
name: Codecov
runs-on: [ubuntu-latest]
needs:
needs:
- test
- build
steps:
- name: Run Codecov
- name: Run Codecov
run: bash <(curl -s https://codecov.io/bash)
3 changes: 3 additions & 0 deletions .golangci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
run:
skip-files:
- ".*\\_test\\.go$"
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ func main() {
c := colly.NewCollector()

// Find and visit all links
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
c.OnHTML("a[href]", "_", func(_ string, e *colly.HTMLElement) {
e.Request.Visit(e.Attr("href"))
})

Expand Down
2 changes: 1 addition & 1 deletion _examples/basic/basic.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ func main() {
)

// On every a element which has href attribute call callback
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
c.OnHTML("a[href]", "_", func(_ string, e *colly.HTMLElement) {
link := e.Attr("href")
// Print link
fmt.Printf("Link found: %q -> %s\n", e.Text, link)
Expand Down
6 changes: 3 additions & 3 deletions _examples/coursera_courses/coursera_courses.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ func main() {
courses := make([]Course, 0, 200)

// On every <a> element which has "href" attribute call callback
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
c.OnHTML("a[href]", "_", func(_ string, e *colly.HTMLElement) {
// If the class attribute is this long string, return from the callback,
// as this <a> element is irrelevant
if e.Attr("class") == "Button_1qxkboh-o_O-primary_cv02ee-o_O-md_28awn8-o_O-primaryLink_109aggg" {
Expand All @@ -67,7 +67,7 @@ func main() {
})

// On every <a> element with collection-product-card class call callback
c.OnHTML(`a.collection-product-card`, func(e *colly.HTMLElement) {
c.OnHTML(`a.collection-product-card`, "_", func(_ string, e *colly.HTMLElement) {
// Activate detailCollector if the link contains "coursera.org/learn"
courseURL := e.Request.AbsoluteURL(e.Attr("href"))
if strings.Index(courseURL, "coursera.org/learn") != -1 {
Expand All @@ -76,7 +76,7 @@ func main() {
})

// Extract details of the course
detailCollector.OnHTML(`div[id=rendered-content]`, func(e *colly.HTMLElement) {
detailCollector.OnHTML(`div[id=rendered-content]`, "_", func(_ string, e *colly.HTMLElement) {
log.Println("Course found", e.Request.URL)
title := e.ChildText(".banner-title")
if title == "" {
Expand Down
2 changes: 1 addition & 1 deletion _examples/cryptocoinmarketcap/cryptocoinmarketcap.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ func main() {
// Instantiate default collector
c := colly.NewCollector()

c.OnHTML("tbody tr", func(e *colly.HTMLElement) {
c.OnHTML("tbody tr", "_", func(_ string, e *colly.HTMLElement) {
writer.Write([]string{
e.ChildText(".cmc-table__column-name"),
e.ChildText(".cmc-table__cell--sort-by__symbol"),
Expand Down
2 changes: 1 addition & 1 deletion _examples/error_handling/error_handling.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ func main() {

// Set HTML callback
// Won't be called if error occurs
c.OnHTML("*", func(e *colly.HTMLElement) {
c.OnHTML("*", "_", func(_ string, e *colly.HTMLElement) {
fmt.Println(e)
})

Expand Down
2 changes: 1 addition & 1 deletion _examples/factba.se/factbase.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ func main() {

d := c.Clone()

d.OnHTML("body", func(e *colly.HTMLElement) {
d.OnHTML("body", "_", func(_ string, e *colly.HTMLElement) {
t := make([]transcript, 0)
e.ForEach(".topic-media-row", func(_ int, el *colly.HTMLElement) {
t = append(t, transcript{
Expand Down
6 changes: 3 additions & 3 deletions _examples/google_groups/google_groups.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ func main() {
mailCollector := colly.NewCollector()

// Collect threads
threadCollector.OnHTML("tr", func(e *colly.HTMLElement) {
threadCollector.OnHTML("tr", "_", func(_ string, e *colly.HTMLElement) {
ch := e.DOM.Children()
author := ch.Eq(1).Text()
// deleted topic
Expand All @@ -49,13 +49,13 @@ func main() {
})

// Visit next page
threadCollector.OnHTML("body > a[href]", func(e *colly.HTMLElement) {
threadCollector.OnHTML("body > a[href]", "_", func(_ string, e *colly.HTMLElement) {
log.Println("Next page link found:", e.Attr("href"))
e.Request.Visit(e.Attr("href"))
})

// Extract mails
mailCollector.OnHTML("body", func(e *colly.HTMLElement) {
mailCollector.OnHTML("body", "_", func(_ string, e *colly.HTMLElement) {
// Find subject
threadSubject := e.ChildText("h2")
if _, ok := threads[threadSubject]; !ok {
Expand Down
2 changes: 1 addition & 1 deletion _examples/hackernews_comments/hackernews_comments.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ func main() {
c := colly.NewCollector()

// Extract comment
c.OnHTML(".comment-tree tr.athing", func(e *colly.HTMLElement) {
c.OnHTML(".comment-tree tr.athing", "_", func(_ string, e *colly.HTMLElement) {
width, err := strconv.Atoi(e.ChildAttr("td.ind img", "width"))
if err != nil {
return
Expand Down
2 changes: 1 addition & 1 deletion _examples/instagram/instagram.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ func main() {
}
})

c.OnHTML("html", func(e *colly.HTMLElement) {
c.OnHTML("html", "_", func(_ string, e *colly.HTMLElement) {
d := c.Clone()
d.OnResponse(func(r *colly.Response) {
requestIds = queryIdPattern.FindAll(r.Body, -1)
Expand Down
4 changes: 2 additions & 2 deletions _examples/local_files/local_files.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,11 @@ func main() {

pages := []string{}

c.OnHTML("h1", func(e *colly.HTMLElement) {
c.OnHTML("h1", "_", func(_ string, e *colly.HTMLElement) {
pages = append(pages, e.Text)
})

c.OnHTML("a", func(e *colly.HTMLElement) {
c.OnHTML("a", "_", func(_ string, e *colly.HTMLElement) {
c.Visit("file://" + dir + "/html" + e.Attr("href"))
})

Expand Down
2 changes: 1 addition & 1 deletion _examples/max_depth/max_depth.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ func main() {
)

// On every a element which has href attribute call callback
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
c.OnHTML("a[href]", "_", func(_ string, e *colly.HTMLElement) {
link := e.Attr("href")
// Print link
fmt.Println(link)
Expand Down
2 changes: 1 addition & 1 deletion _examples/multipart/multipart.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ func main() {
c := colly.NewCollector(colly.AllowURLRevisit(), colly.MaxDepth(5))

// On every html element call callback
c.OnHTML("html", func(e *colly.HTMLElement) {
c.OnHTML("html", "_", func(_ string, e *colly.HTMLElement) {
fmt.Println(e.Text)
time.Sleep(1 * time.Second)
e.Request.PostMultipart("http://localhost:8080/", generateFormData())
Expand Down
4 changes: 2 additions & 2 deletions _examples/openedx_courses/openedx_courses.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ func main() {
courses := make([]Course, 0, 200)

// On every a element which has href attribute call callback
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
c.OnHTML("a[href]", "_", func(_ string, e *colly.HTMLElement) {
link := e.Attr("href")
if !strings.HasPrefix(link, "/courses/") {
return
Expand All @@ -46,7 +46,7 @@ func main() {
e.Request.Visit(link)
})

c.OnHTML("div[class=main-container]", func(e *colly.HTMLElement) {
c.OnHTML("div[class=main-container]", "_", func(_ string, e *colly.HTMLElement) {
if e.DOM.Find("section#course-info").Length() == 0 {
return
}
Expand Down
2 changes: 1 addition & 1 deletion _examples/parallel/parallel.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ func main() {
c.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 2})

// On every a element which has href attribute call callback
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
c.OnHTML("a[href]", "_", func(_ string, e *colly.HTMLElement) {
link := e.Attr("href")
// Print link
fmt.Println(link)
Expand Down
4 changes: 2 additions & 2 deletions _examples/reddit/reddit.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ func main() {

// On every element which has the .top-matter class call callback
// This class is unique to the div that holds all information about a story
c.OnHTML(".top-matter", func(e *colly.HTMLElement) {
c.OnHTML(".top-matter", "_", func(_ string, e *colly.HTMLElement) {
temp := item{}
temp.StoryURL = e.ChildAttr("a[data-event-action=title]", "href")
temp.Source = "https://old.reddit.com/r/programming/"
Expand All @@ -40,7 +40,7 @@ func main() {
})

// On every span tag with the class next-button
c.OnHTML("span.next-button", func(h *colly.HTMLElement) {
c.OnHTML("span.next-button", "_", func(_ string, h *colly.HTMLElement) {
t := h.ChildAttr("a", "href")
c.Visit(t)
})
Expand Down
2 changes: 1 addition & 1 deletion _examples/scraper_server/scraper_server.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ func handler(w http.ResponseWriter, r *http.Request) {
p := &pageInfo{Links: make(map[string]int)}

// count links
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
c.OnHTML("a[href]", "_", func(_ string, e *colly.HTMLElement) {
link := e.Request.AbsoluteURL(e.Attr("href"))
if link != "" {
p.Links[link]++
Expand Down
2 changes: 1 addition & 1 deletion _examples/url_filter/url_filter.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ func main() {
)

// On every a element which has href attribute call callback
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
c.OnHTML("a[href]", "_", func(_ string, e *colly.HTMLElement) {
link := e.Attr("href")
// Print link
fmt.Printf("Link found: %q -> %s\n", e.Text, link)
Expand Down
4 changes: 2 additions & 2 deletions _examples/xkcd_store/xkcd_store.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ func main() {
)

// Extract product details
c.OnHTML(".product-grid-item", func(e *colly.HTMLElement) {
c.OnHTML(".product-grid-item", "_", func(_ string, e *colly.HTMLElement) {
writer.Write([]string{
e.ChildAttr("a", "title"),
e.ChildText("span"),
Expand All @@ -38,7 +38,7 @@ func main() {
})

// Find and visit next page links
c.OnHTML(`.next a[href]`, func(e *colly.HTMLElement) {
c.OnHTML(`.next a[href]`, "_", func(_ string, e *colly.HTMLElement) {
e.Request.Visit(e.Attr("href"))
})

Expand Down
6 changes: 3 additions & 3 deletions cmd/colly/colly.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ var scraperEndTemplate = `
`

var htmlCallbackTemplate = `
c.OnHTML("element-selector", func(e *colly.HTMLElement) {
c.OnHTML("element-selector", "unique-key", func(_ string, e *colly.HTMLElement) {
log.Println(e.Text)
})
`
Expand Down Expand Up @@ -113,9 +113,9 @@ func main() {
}
}
scraper.WriteString(scraperEndTemplate)
outfile.Write(scraper.Bytes())
_, _ = outfile.Write(scraper.Bytes())
}
})

app.Run(os.Args)
_ = app.Run(os.Args)
}