Skip to content

Commit

Permalink
Merge branch 'release/0.10.0'
Browse files Browse the repository at this point in the history
  • Loading branch information
sutra committed Dec 5, 2023
2 parents 19288e9 + 73dd2eb commit 5d55bf3
Show file tree
Hide file tree
Showing 18 changed files with 118 additions and 53 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -118,9 +118,9 @@ The architecture of webmagic (refered to [Scrapy](http://scrapy.org/))

There are more examples in `webmagic-samples` package.

### Lisence:
### License:

Lisenced under [Apache 2.0 lisence](http://opensource.org/licenses/Apache-2.0)
Licensed under [Apache 2.0 license](http://opensource.org/licenses/Apache-2.0)

### Thanks:

Expand Down
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<groupId>us.codecraft</groupId>
<version>0.9.1</version>
<version>0.10.0</version>
<modelVersion>4.0.0</modelVersion>
<packaging>pom</packaging>
<properties>
Expand Down
2 changes: 1 addition & 1 deletion src/site/site.xml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
<skin>
<groupId>org.apache.maven.skins</groupId>
<artifactId>maven-fluido-skin</artifactId>
<version>1.9</version>
<version>1.11.1</version>
</skin>
<body>
<menu ref="parent" inherit="top" />
Expand Down
2 changes: 1 addition & 1 deletion webmagic-core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId>
<version>0.9.1</version>
<version>0.10.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>

Expand Down
62 changes: 47 additions & 15 deletions webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
Original file line number Diff line number Diff line change
Expand Up @@ -49,15 +49,34 @@ public class Page {

private byte[] bytes;

private List<Request> targetRequests = new ArrayList<Request>();
private List<Request> targetRequests = new ArrayList<>();

private String charset;

public Page() {
}

public static Page fail(){
/**
 * Creates a {@link Page} whose {@link #downloadSuccess} flag is {@code false},
 * by delegating to {@link #fail(Request)} with a {@code null} request.
 *
 * @return the failed page.
 * @deprecated Use {@link #fail(Request)} instead.
 */
@Deprecated
public static Page fail() {
    final Request noRequest = null;
    return fail(noRequest);
}

/**
 * Creates a {@link Page} whose {@link #downloadSuccess} flag is {@code false}
 * and whose {@link #request} is the one supplied by the caller.
 *
 * @param request the request to attach to the failed page; passed straight
 *                to {@link #setRequest(Request)}.
 * @return the failed page.
 * @since 0.10.0
 */
public static Page fail(Request request) {
    final Page failedPage = new Page();
    failedPage.setRequest(request);
    failedPage.setDownloadSuccess(false);
    return failedPage;
}
Expand Down Expand Up @@ -123,13 +142,7 @@ public List<Request> getTargetRequests() {
* @param requests requests
*/
public void addTargetRequests(Iterable<String> requests) {
for (String s : requests) {
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
continue;
}
s = UrlUtils.canonicalizeUrl(s, url.toString());
targetRequests.add(new Request(s));
}
addTargetRequests(requests, 0); // Default priority is 0
}

/**
Expand All @@ -139,13 +152,32 @@ public void addTargetRequests(Iterable<String> requests) {
* @param priority priority
*/
public void addTargetRequests(Iterable<String> requests, long priority) {
for (String s : requests) {
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
continue;
}
s = UrlUtils.canonicalizeUrl(s, url.toString());
targetRequests.add(new Request(s).setPriority(priority));
if(requests == null) {
return;
}

for (String req : requests) {
addRequestIfValid(req, priority);
}
}

/**
 * Adds a single URL to {@link #targetRequests} if it is a usable link.
 * <p>
 * Blank URLs, the bare fragment {@code "#"} and {@code javascript:} pseudo-links
 * are skipped. Any other URL is canonicalized against this page's URL and then
 * wrapped in a new {@link Request}.
 *
 * @param url      URL to add
 * @param priority priority for the URL; every non-zero value, including a
 *                 negative one, is applied to the created request
 */
private void addRequestIfValid(String url, long priority) {
    if (StringUtils.isBlank(url) || url.equals("#") || url.startsWith("javascript:")) {
        return;
    }

    String canonicalizedUrl = UrlUtils.canonicalizeUrl(url, this.url.toString());
    Request req = new Request(canonicalizedUrl);
    // Apply every explicit priority, not only positive ones: a negative value is a
    // legitimate "lower than default" priority and must not be silently dropped.
    // Zero is skipped because the no-priority overload passes it as the default.
    if (priority != 0) {
        req.setPriority(priority);
    }
    targetRequests.add(req);
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,26 +36,62 @@ public Html download(String url, String charset) {
return (Html) page.getHtml();
}

/**
 * Success callback that receives only the request. The body is empty, so
 * nothing happens unless a subclass overrides it.
 *
 * @param request the {@link Request}.
 * @deprecated Use {@link #onSuccess(Page, Task)} instead.
 */
@Deprecated
protected void onSuccess(Request request) {
    // intentionally empty: extension point for subclasses
}

/**
 * Success callback that receives the request and its task. This
 * implementation ignores the task and forwards to
 * {@link #onSuccess(Request)}.
 *
 * @param request the {@link Request}.
 * @param task the {@link Task}.
 * @since 0.7.6
 * @deprecated Use {@link #onSuccess(Page, Task)} instead.
 */
@Deprecated
protected void onSuccess(Request request, Task task) {
    onSuccess(request);
}

/**
 * Success callback that receives the downloaded page and its task. This
 * implementation takes the request from the page and forwards to
 * {@link #onSuccess(Request, Task)}.
 *
 * @param page the {@link Page}.
 * @param task the {@link Task}.
 * @since 0.10.0
 */
protected void onSuccess(Page page, Task task) {
    final Request request = page.getRequest();
    onSuccess(request, task);
}

/**
 * Error callback that receives only the request. The body is empty, so
 * nothing happens unless a subclass overrides it.
 *
 * @param request the {@link Request}.
 * @deprecated Use {@link #onError(Page, Task, Throwable)} instead.
 */
@Deprecated
protected void onError(Request request) {
    // intentionally empty: extension point for subclasses
}

/**
 * Error callback that receives the request, its task and the failure. This
 * implementation ignores the task and the exception and forwards to
 * {@link #onError(Request)}.
 *
 * @param request the {@link Request}.
 * @param task the {@link Task}.
 * @param e the exception.
 * @since 0.7.6
 * @deprecated Use {@link #onError(Page, Task, Throwable)} instead.
 */
@Deprecated
protected void onError(Request request, Task task, Throwable e) {
    onError(request);
}

/**
 * Error callback that receives the page, its task and the failure. This
 * implementation takes the request from the page and forwards to
 * {@link #onError(Request, Task, Throwable)}.
 *
 * @param page the {@link Page}.
 * @param task the {@link Task}.
 * @param e the exception.
 * @since 0.10.0
 */
protected void onError(Page page, Task task, Throwable e) {
    final Request request = page.getRequest();
    onError(request, task, e);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -79,18 +79,18 @@ public Page download(Request request, Task task) {
CloseableHttpClient httpClient = getHttpClient(task.getSite());
Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(request, task) : null;
HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy);
Page page = Page.fail();
Page page = Page.fail(request);
try {
httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext());
page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task);

onSuccess(request, task);
onSuccess(page, task);
logger.info("downloading page success {}", request.getUrl());

return page;
} catch (IOException e) {

onError(request, task, e);
onError(page, task, e);
logger.info("download page {} error", request.getUrl(), e);

return page;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import java.util.ArrayList;
import java.util.List;

import org.apache.commons.collections4.CollectionUtils;

/**
Expand Down Expand Up @@ -55,11 +56,12 @@ public Selectable jsonPath(String jsonPath) {

@Override
public String get() {
if (CollectionUtils.isNotEmpty(all())) {
return all().get(0);
} else {
return null;
}
List<String> sourceTexts = all();
if (CollectionUtils.isNotEmpty(sourceTexts)) {
return sourceTexts.get(0);
}
return null;

}

@Override
Expand Down Expand Up @@ -91,8 +93,9 @@ public Selectable replace(String regex, String replacement) {
}

public String getFirstSourceText() {
if (getSourceTexts() != null && getSourceTexts().size() > 0) {
return getSourceTexts().get(0);
List<String> sourceTexts = getSourceTexts();
if (CollectionUtils.isNotEmpty(sourceTexts)) {
return sourceTexts.get(0);
}
return null;
}
Expand All @@ -104,6 +107,6 @@ public String toString() {

@Override
public boolean match() {
return getSourceTexts() != null && getSourceTexts().size() > 0;
return CollectionUtils.isNotEmpty(getSourceTexts());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,6 @@
public abstract class NumberUtils {

public static int compareLong(long o1, long o2) {
if (o1 < o2) {
return -1;
} else if (o1 == o2) {
return 0;
} else {
return 1;
}
return Long.compare(o1, o2);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@ public static <T> Set<T> newHashSet(T... t){
}

public static <T> List<T> newArrayList(T... t){
List<T> set = new ArrayList<T>(t.length);
List<T> list = new ArrayList<T>(t.length);
for (T t1 : t) {
set.add(t1);
list.add(t1);
}
return set;
return list;
}
}
2 changes: 1 addition & 1 deletion webmagic-coverage/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId>
<version>0.9.1</version>
<version>0.10.0</version>
</parent>

<artifactId>webmagic-coverage</artifactId>
Expand Down
2 changes: 1 addition & 1 deletion webmagic-extension/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId>
<version>0.9.1</version>
<version>0.10.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ public Page download(Request request, Task task) {
logger.info("downloading page: " + request.getUrl());
}

Page page = Page.fail();
Page page = Page.fail(request);
try {
String content = getPage(request);
if (!content.contains("HTTP request failed")) {
Expand All @@ -98,9 +98,9 @@ public Page download(Request request, Task task) {
page.setRequest(request);
page.setStatusCode(200);
}
onSuccess(request, task);
onSuccess(page, task);
} catch (Exception e) {
onError(request, task, e);
onError(page, task, e);
logger.warn("download page {} error", request.getUrl(), e);
}
return page;
Expand Down
2 changes: 1 addition & 1 deletion webmagic-samples/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
<parent>
<artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId>
<version>0.9.1</version>
<version>0.10.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>

Expand Down
2 changes: 1 addition & 1 deletion webmagic-saxon/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
<parent>
<artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId>
<version>0.9.1</version>
<version>0.10.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>

Expand Down
2 changes: 1 addition & 1 deletion webmagic-scripts/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
<parent>
<artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId>
<version>0.9.1</version>
<version>0.10.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>

Expand Down
2 changes: 1 addition & 1 deletion webmagic-selenium/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
<parent>
<artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId>
<version>0.9.1</version>
<version>0.10.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ public SeleniumDownloader setSleepTime(int sleepTime) {
public Page download(Request request, Task task) {
checkInit();
WebDriver webDriver = null;
Page page = Page.fail();
Page page = Page.fail(request);
try {
webDriver = webDriverPool.get();

Expand Down Expand Up @@ -111,10 +111,10 @@ public Page download(Request request, Task task) {
page.setHtml(new Html(content, request.getUrl()));
page.setUrl(new PlainText(request.getUrl()));
page.setRequest(request);
onSuccess(request, task);
onSuccess(page, task);
} catch (Exception e) {
logger.warn("download page {} error", request.getUrl(), e);
onError(request, task, e);
onError(page, task, e);
} finally {
if (webDriver != null) {
webDriverPool.returnToPool(webDriver);
Expand Down

0 comments on commit 5d55bf3

Please sign in to comment.