WebMagic爬取网站图片,Java交流,编程语言专区,鱼C论坛

漠漠lun 发表于 2017-3-12 21:38:08

WebMagic爬取网站图片

WebMagic网站http://webmagic.io/
参考https://www.oschina.net/code/snippet_1397325_35514

1.实现PageProcessor
import java.util.ArrayList;
import java.util.List;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.utils.UrlUtils;

/**
 * WebMagic {@link PageProcessor} that extracts image URLs from every crawled page and
 * queues further pages whose links match a caller-supplied URL pattern.
 *
 * <p>For each page a single result field named {@code "img"} is published. Its value is a
 * {@code List<String>} whose first element is a folder name derived from the page title
 * and whose remaining elements are the image URLs found inside {@code div#picture}.
 */
public class ImgProcessor implements PageProcessor {

    /**
     * Pattern of the image URLs to collect: {@code .../uploads/<year>/<n>/<n>/<n>.jpg}.
     * The image host may change over time; update this pattern if no images are found.
     */
    private static final String IMG_REGEX =
            "http://mm.howkuai.com/wp-content/uploads/20[0-9]{2}/[0-9]{1,4}/[0-9]{1,4}/[0-9]{1,4}.jpg";

    /**
     * Characters removed from the page title before it is used as a folder name.
     * This is a character class, so it strips punctuation, whitespace and the individual
     * characters listed inside the brackets (not the phrase as a whole).
     */
    private static final String TITLE_NOISE_REGEX = "[|\\pP‘’“”\\s(妹子图)]";

    /** Folder name used when the page has no usable title. */
    private static final String DEFAULT_TITLE = "untitled";

    /** Regex that a link must match to be scheduled for crawling; may be null. */
    private String urlPattern;

    private Site site;

    /**
     * Creates a processor with a default {@link Site} and no link pattern, i.e. one that
     * only processes the start pages and does not follow links.
     */
    public ImgProcessor() {
        this.site = Site.me();
    }

    /**
     * @param startUrl   URL the crawl starts from; its domain is used to configure the site
     * @param urlPattern regex that discovered links must match to be crawled
     */
    public ImgProcessor(String startUrl, String urlPattern) {
        this.site = Site.me().setDomain(UrlUtils.getDomain(startUrl));
        this.urlPattern = urlPattern;
    }

    /**
     * Publishes the {@code "img"} field (folder name followed by image URLs) for the page
     * and schedules every link on the page that matches the configured URL pattern.
     */
    @Override
    public void process(Page page) {
        // The title selector yields null when the page has no <title>; guard against it.
        String rawTitle = page.getHtml().xpath("//title/text()").toString();
        String folderName = rawTitle == null ? "" : rawTitle.replaceAll(TITLE_NOISE_REGEX, "");
        if (folderName.isEmpty()) {
            folderName = DEFAULT_TITLE;
        }

        List<String> imageUrls = page.getHtml().$("div#picture").regex(IMG_REGEX).all();

        // The title travels with the URLs (as element 0) so that the pipeline can use it
        // as the folder name. Build a fresh list instead of mutating the selector's result.
        List<String> imgField = new ArrayList<String>(imageUrls.size() + 1);
        imgField.add(folderName);
        imgField.addAll(imageUrls);
        page.putField("img", imgField);

        if (urlPattern != null) {
            page.addTargetRequests(page.getHtml().links().regex(urlPattern).all());
        }
    }

    @Override
    public Site getSite() {
        return site;
    }

}

2.实现Pipeline
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.utils.FilePersistentBase;

public class ImgPipeline extends FilePersistentBase implements Pipeline {

private Logger logger = LoggerFactory.getLogger(getClass());
public ImgPipeline() {
   setPath("/data/webmagic/");
}

public ImgPipeline(String path) {
   setPath(path);
}

@Override
public void process(ResultItems resultItems, Task task) {
   String fileStorePath = this.path;
   try {

         String imgShortNameNew="(http://mm.howkuai.com/wp-content/uploads/)|(jpg)";
         CloseableHttpClient httpclient = HttpClients.createDefault();
         for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {

            if (entry.getValue() instanceof List) {

               List listOne= (List) entry.getValue();
               List<String> list = new ArrayList<String>();
               for(int i=0;i<listOne.size();i++){
                     list.add((String)listOne.get(i));
               }

               for(int i=0;i<list.size();i++)
               {

                     StringBuffer sb = new StringBuffer();
                     StringBuffer imgFileNameNewYuan =sb.append(fileStorePath)
                           .append(list.get(0)) //此处提取文件夹名，即之前采集的标题名
                           .append("\\");
                     //这里先判断文件夹名是否存在，不存在则建立相应文件夹
                     Path target = Paths.get(imgFileNameNewYuan.toString());
                     if(!Files.isReadable(target)){
                        Files.createDirectory(target);
                     }
                     String extName=com.google.common.io
                                    .Files.getFileExtension(list.get(i));//获取文件后缀
                     StringBuffer imgFileNameNew = imgFileNameNewYuan
                           .append((list.get(i)).replaceAll(imgShortNameNew, "")
                           .replaceAll("[\\pP‘’“”]", ""))
                           .append(".")
                           .append(extName);
                     if(extName == null || extName.equals("")){
                        continue;
                     }


                     //这里通过httpclient下载之前抓取到的图片网址，并放在对应的文件中
                     HttpGet httpget = new HttpGet(list.get(i));
                     HttpResponse response = httpclient.execute(httpget);
                     HttpEntity entity = response.getEntity();
                     InputStream in = entity.getContent();

                     File file = new File(imgFileNameNew.toString());

                     System.out.println("Download:"+imgFileNameNew);
                  if(!file.exists()){
                     try {
                           FileOutputStream fout = new FileOutputStream(file);
                           int l = -1;
                           byte[] tmp = new byte;
                           while ((l = in.read(tmp)) != -1) {
                              fout.write(tmp,0,l);
                           }
                           fout.flush();
                           fout.close();
                        } finally {

                           in.close();
                        }
                  }

               }
            }else {
               System.out.println(entry.getKey() + ":\t" + entry.getValue());
            }
         }
         httpclient.close();
   } catch (IOException e) {
         logger.warn("write file error", e);
   }
}

}

3.爬图
import us.codecraft.webmagic.Spider;

/**
 * Demo entry point: crawls http://www.meizitu.com/ and stores the images it finds.
 *
 * <p>The target site is used only to demonstrate the code; do not crawl it excessively.
 */
public class ImgSpiderTest {

    /** Storage directory used when none is given on the command line. */
    private static final String DEFAULT_STORE_PATH = "E:\\webmagic-data\\test";

    /**
     * @param args optional; {@code args[0]} overrides the directory images are stored in
     */
    public static void main(String[] args) {
        String fileStorePath = args.length > 0 ? args[0] : DEFAULT_STORE_PATH;

        // Create the storage directory up front so a missing folder does not fail the run.
        java.io.File storeDir = new java.io.File(fileStorePath);
        if (!storeDir.isDirectory() && !storeDir.mkdirs()) {
            System.err.println("Cannot create storage directory: " + fileStorePath);
            return;
        }

        // Detail pages look like http://www.meizitu.com/<letter>/<number>.html
        String urlPattern = "http://www.meizitu.com/[a-z]/[0-9]{1,4}.html";
        ImgProcessor imgspider = new ImgProcessor("http://www.meizitu.com/", urlPattern);

        Spider.create(imgspider)
                .addUrl("http://www.meizitu.com/")
                .addPipeline(new ImgPipeline(fileStorePath))
                .thread(10)    // number of crawler threads; adjust as needed
                .run();
    }
}

ImgProcessor中的图片地址正则表达式"http://mm.howkuai.com/wp-content/uploads/20[0-9]{2}/[0-9]{1,4}/[0-9]{1,4}/[0-9]{1,4}.jpg"可能会随网站改版而变化；如果爬不到图片，请先检查并更新该正则表达式。

零度非安全 发表于 2017-3-12 22:28:25

楼主可以！！！{:10_275:}

漠漠lun 发表于 2017-3-13 13:01:31

零度非安全发表于 2017-3-12 22:28
楼主可以！！！

如果不是前几天看见你的正则表达式，我也不会想到WebMagic

qaqar 发表于 2017-8-4 16:32:07

楼主，为啥我只能爬到标题，没有图片内容呢

anzhexuan666 发表于 2022-3-25 09:27:48

哦吼

页: [1]

鱼C论坛's Archiver

WebMagic爬取网站图片