code4craft · apaqi · Jul 16, 2018 · Jul 18, 2018 · Aug 23, 2020 · Aug 23, 2020
diff --git a/pom.xml b/pom.xml
@@ -193,6 +193,11 @@
                 <artifactId>jedis</artifactId>
                 <version>2.9.3</version>
             </dependency>
+            <dependency>
+                <groupId>net.jcip</groupId>
+                <artifactId>jcip-annotations</artifactId>
+                <version>1.0</version>
+            </dependency>
         </dependencies>
     </dependencyManagement>
 

diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml
@@ -80,7 +80,10 @@
             <groupId>com.alibaba</groupId>
             <artifactId>fastjson</artifactId>
         </dependency>
-
+        <dependency>
+            <groupId>net.jcip</groupId>
+            <artifactId>jcip-annotations</artifactId>
+        </dependency>
     </dependencies>
 
 </project>
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
@@ -52,7 +52,7 @@ public class Site {
      *
      * @return new site
      */
-    public static Site me() {
+    public static Site me() {
         return new Site();
     }
 

diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
@@ -32,6 +32,7 @@
 import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
 import org.apache.http.impl.cookie.BasicClientCookie;
 import org.apache.http.protocol.HttpContext;
+import org.apache.http.ssl.SSLContexts;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -142,7 +143,15 @@ public void process(
         connectionManager.setDefaultSocketConfig(socketConfig);
         httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true));
         generateCookie(httpClientBuilder, site);
-        return httpClientBuilder.build();
+        SSLContext ctx = null; // left null on failure; HttpClientBuilder then chooses its own default context
+        try {
+            ctx = SSLContexts.custom().useProtocol("TLSv1.2").build();
+        } catch (NoSuchAlgorithmException e) {
+            logger.warn("CloseableHttpClient getClient #NoSuchAlgorithmException", e);
+        } catch (KeyManagementException e) {
+            logger.warn("CloseableHttpClient getClient #KeyManagementException", e);
+        }
+        return httpClientBuilder.setSSLContext(ctx).build(); // keep the configured builder: create() is static and returns a fresh, unconfigured one
     }
 
     private void generateCookie(HttpClientBuilder httpClientBuilder, Site site) {

diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java
@@ -1,5 +1,6 @@
 package us.codecraft.webmagic.pipeline;
 
+import net.jcip.annotations.ThreadSafe;
 import org.apache.commons.codec.digest.DigestUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -19,7 +20,9 @@
  * @author code4crafter@gmail.com <br>
  * @since 0.1.0
  */
-public class FilePipeline extends FilePersistentBase implements Pipeline {
+@ThreadSafe
+public class    FilePipeline extends FilePersistentBase implements Pipeline {
+
 
     private Logger logger = LoggerFactory.getLogger(getClass());
 

diff --git a/...c-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcessor.java b/...c-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcessor.java
@@ -39,7 +39,7 @@ public static void main(String[] args) {
         //multidownload
         List<String> list = new ArrayList<String>();
         list.add(String.format(urlTemplate,"风力发电"));
-        list.add(String.format(urlTemplate,"太阳能"));
+       // list.add(String.format(urlTemplate,"太阳能"));
         list.add(String.format(urlTemplate,"地热发电"));
         list.add(String.format(urlTemplate,"地热发电"));
         List<ResultItems> resultItemses = spider.<ResultItems>getAll(list);

diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/DownImgUtil.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/DownImgUtil.java
@@ -0,0 +1,96 @@
+package us.codecraft.webmagic.processor.example;
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.net.HttpURLConnection;
+import java.net.URL;
+
+/**
+ * Example helper that downloads a resource over HTTP and saves it to a local file.
+ * <p>
+ * Created by wpx on 2018/7/17.
+ */
+public class DownImgUtil {
+
+    /** Connect timeout in milliseconds (3 seconds). */
+    private static final int CONNECT_TIMEOUT_MILLIS = 3 * 1000;
+
+    /**
+     * Downloads the resource at the given URL and writes it to {@code savePath/fileName}.
+     *
+     * @param urlStr   URL of the resource to download
+     * @param fileName name of the file to create inside {@code savePath}
+     * @param savePath target directory, created (with parents) if missing; {@code null} or empty
+     *                 means the current working directory
+     * @throws IOException if the download fails, the directory cannot be created or the file cannot be written
+     */
+    public static void downLoadFromUrl(String urlStr, String fileName, String savePath) throws IOException {
+        URL url = new URL(urlStr);
+        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
+        try {
+            conn.setRequestMethod("GET");
+            conn.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
+            // Send a browser-like User-Agent so servers that block crawlers do not answer 403.
+            conn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
+
+            byte[] data;
+            InputStream inputStream = conn.getInputStream();
+            try {
+                data = readInputStream(inputStream);
+            } finally {
+                inputStream.close();
+            }
+
+            // An empty path would resolve the file against a system-dependent default directory, so use ".".
+            File saveDir = new File(savePath == null || savePath.isEmpty() ? "." : savePath);
+            if (!saveDir.isDirectory() && !saveDir.mkdirs()) {
+                throw new IOException("Cannot create directory " + saveDir);
+            }
+            OutputStream fos = new FileOutputStream(new File(saveDir, fileName));
+            try {
+                fos.write(data);
+            } finally {
+                fos.close();
+            }
+        } finally {
+            conn.disconnect();
+        }
+
+        System.out.println("info:"+url+" download success");
+    }
+
+    /**
+     * Reads the given stream until end-of-stream and returns everything read. The stream is not closed.
+     *
+     * @param inputStream stream to drain
+     * @return all bytes read from the stream
+     * @throws IOException if reading fails
+     */
+    public static byte[] readInputStream(InputStream inputStream) throws IOException {
+        byte[] buffer = new byte[1024];
+        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+        int len;
+        while ((len = inputStream.read(buffer)) != -1) {
+            bos.write(buffer, 0, len);
+        }
+        return bos.toByteArray();
+    }
+
+    /**
+     * Demo entry point: downloads one sample image to a fixed local directory.
+     *
+     * @param args unused
+     */
+    public static void main(String[] args) {
+        try {
+            downLoadFromUrl("http://img1.mm131.me/pic/4170/15.jpg", "百度.jpg", "D:\\webimg");
+        } catch (IOException e) {
+            // Report the failure instead of silently swallowing it.
+            System.err.println("download failed: " + e);
+        }
+    }
+}
diff --git a/...c-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java b/...c-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java
@@ -6,24 +6,35 @@
 import us.codecraft.webmagic.processor.PageProcessor;
 
 /**
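+ * This section uses GithubRepoPageProcessor as an example to show how a PageProcessor is written.
+ * Customizing a PageProcessor has three parts: crawler configuration, page element extraction and link discovery.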
+ *
  * @author code4crafter@gmail.com <br>
  * @since 0.3.2
  */
 public class GithubRepoPageProcessor implements PageProcessor {
+    // Part one: site configuration for the crawl, including encoding, crawl interval, retry count, etc.
 
     private Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000);
 
+    // process() is the core hook for custom crawler logic; the extraction logic is written here.
     @Override
     public void process(Page page) {
-        page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all());
-        page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-])").all());
+        // Part two: define how to extract information from the page and store it.
         page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
         page.putField("name", page.getHtml().xpath("//h1[@class='public']/strong/a/text()").toString());
         if (page.getResultItems().get("name")==null){
             //skip this page
             page.setSkip(true);
         }
         page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));
+        // Part three: discover follow-up URLs on the page to crawl next.
+        /**
+         * page.getHtml().links().regex(...).all() collects every link on the page that matches the given
+         * regular expression; page.addTargetRequests() then adds those links to the queue of URLs to crawl.
+         */
+        page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all());
+        page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-])").all());
     }
 
     @Override
@@ -32,6 +43,9 @@ public Site getSite() {
     }
 
     public static void main(String[] args) {
-        Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run();
+        //https://blog.csdn.net/bbc2005/article/details/80890829
+        //https://www.cnblogs.com/sunny08/p/8038440.html
+       // System.setProperty("javax.net.debug", "all"); // prints the network connection handshake details
+        Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(1).run();
     }
 }
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/ImgPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/ImgPageProcessor.java
@@ -0,0 +1,98 @@
+package us.codecraft.webmagic.processor.example;
+
+import org.apache.commons.collections.CollectionUtils;
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.processor.PageProcessor;
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Example PageProcessor that saves the picture shown on a page and then follows the links
+ * found on that page. It has three parts: crawler configuration, extraction of page
+ * content, and discovery of follow-up links.
+ *
+ * @author code4crafter@gmail.com <br>
+ * @since 0.3.2
+ */
+public class ImgPageProcessor implements PageProcessor {
+
+    /** Directory the pictures are saved to (the current working directory). */
+    private static final String SAVE_DIR = ".";
+
+    // Part one: site configuration (retry count, sleep time between requests, timeout).
+    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000);
+
+    /**
+     * Saves the picture found on the page and queues the linked pages for crawling.
+     *
+     * @param page the page to process
+     */
+    @Override
+    public void process(Page page) {
+        // Part two: extract the picture's name and address, then save the picture.
+        String imgName = page.getHtml().xpath("//div[@class='content-pic']/a/img//@alt").toString();
+        String imgUrl = page.getHtml().xpath("//div[@class='content-pic']/a/img//@src").toString();
+        // Compare by content: == and != on strings compare references, not characters.
+        if (isNotEmpty(imgUrl) && isNotEmpty(imgName)) {
+            try {
+                DownImgUtil.downLoadFromUrl(imgUrl, imgName, SAVE_DIR);
+            } catch (IOException e) {
+                System.err.println("failed to download " + imgUrl + ": " + e);
+            }
+        }
+
+        // Part three: discover follow-up URLs. Each extracted href is appended to
+        // "<protocol>://<host>/<first path segment>" of the current page.
+        String urlPrefix = urlPrefixOf(page.getUrl().toString());
+        List<String> urls = page.getHtml().links().xpath("/html/body/div[6]/div[3]//@href").all();
+        List<String> handledUrls = new ArrayList<String>();
+        if (CollectionUtils.isNotEmpty(urls)) {
+            for (String href : urls) {
+                handledUrls.add(urlPrefix + href);
+            }
+        }
+        page.addTargetRequests(handledUrls);
+    }
+
+    @Override
+    public Site getSite() {
+        return site;
+    }
+
+    /**
+     * Builds {@code <protocol>://<host>/<first path segment>} from the given URL.
+     *
+     * @param pageUrl URL of the current page
+     * @return the prefix; without the path segment when the URL path has none, or "" when the URL is malformed
+     */
+    private static String urlPrefixOf(String pageUrl) {
+        try {
+            URL url = new URL(pageUrl);
+            String prefix = url.getProtocol() + "://" + url.getHost();
+            String[] segments = url.getPath().split("/");
+            // Guard the index: a path such as "" or "/" has no first segment.
+            return segments.length > 1 ? prefix + "/" + segments[1] : prefix;
+        } catch (MalformedURLException e) {
+            System.err.println("malformed page url " + pageUrl + ": " + e);
+            return "";
+        }
+    }
+
+    /** Returns true when the string is neither null nor empty. */
+    private static boolean isNotEmpty(String value) {
+        return value != null && !value.isEmpty();
+    }
+
+    public static void main(String[] args) {
+        //https://blog.csdn.net/bbc2005/article/details/80890829
+        //https://www.cnblogs.com/sunny08/p/8038440.html
+        // System.setProperty("javax.net.debug", "all"); // prints the network connection handshake details
+        Spider.create(new ImgPageProcessor()).addUrl("http://www.mm131.com/xinggan/4170.html").thread(1).run();
+    }
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java
@@ -37,7 +37,7 @@ public static void main(String[] args) {
         //multidownload
         List<String> list = new ArrayList<String>();
         list.add(String.format(urlTemplate,"风力发电"));
-        list.add(String.format(urlTemplate,"太阳能"));
+       // list.add(String.format(urlTemplate,"太阳能"));
         list.add(String.format(urlTemplate,"地热发电"));
         list.add(String.format(urlTemplate,"地热发电"));
         List<BaiduBaike> resultItemses = ooSpider.<BaiduBaike>getAll(list);

diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageModelExtractorTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageModelExtractorTest.java
@@ -31,7 +31,7 @@ public static class ModelDateStr {
 
     public static class ModelDate {
 
-        @Formatter(value = "yyyyMMdd", formatter = DateFormatter.class)
+        @Formatter(value = "yyyyMMdd", formatter = DateFormatter.class)
         @ExtractBy(value = "//div[@class='date']/text()", notNull = true)
         private Date date;
 
 
@@ -53,23 +53,23 @@ public static class ModelStringList {
 
     public static class ModelIntList {
 
-        @Formatter(subClazz = Integer.class)
+        @Formatter(subClazz = Integer.class)
         @ExtractBy("//li[@class='numbers']/text()")
         private List<Integer> numbers;
 
     }
 
     public static class ModelDateList {
 
-        @Formatter(subClazz = Date.class, value = "yyyyMMdd")
+        @Formatter(subClazz = Date.class, value = "yyyyMMdd")
         @ExtractBy("//li[@class='dates']/text()")
         private List<Date> dates;
 
     }
 
     public static class ModelCustomList {
 
-        @Formatter(subClazz = Date.class, value = "yyyyMMdd",formatter = DateFormatter.class)
+        @Formatter(subClazz = Date.class, value = "yyyyMMdd",formatter = DateFormatter.class)
         @ExtractBy("//li[@class='dates']/text()")
         private List<Date> dates;