Java多线程Web爬虫 Crawler4j
jopen
12年前
Crawler4j是一个开源的Java Web爬虫，提供一个用于抓取Web页面的简单接口。您可以在5分钟内建立一个多线程的网络爬虫！
示例代码:
import java.util.ArrayList; import java.util.regex.Pattern; import edu.uci.ics.crawler4j.crawler.Page; import edu.uci.ics.crawler4j.crawler.WebCrawler; import edu.uci.ics.crawler4j.url.WebURL; public class MyCrawler extends WebCrawler { Pattern filters = Pattern.compile(".*(\\.(css|js|bmp|gif|jpe?g" + "|png|tiff?|mid|mp2|mp3|mp4" + "|wav|avi|mov|mpeg|ram|m4v|pdf" + "|rm|smil|wmv|swf|wma|zip|rar|gz))$"); public MyCrawler() { } public boolean shouldVisit(WebURL url) { String href = url.getURL().toLowerCase(); if (filters.matcher(href).matches()) { return false; } if (href.startsWith("http://www.ics.uci.edu/")) { return true; } return false; } public void visit(Page page) { int docid = page.getWebURL().getDocid(); String url = page.getWebURL().getURL(); String text = page.getText(); ArrayList<WebURL> links = page.getURLs(); } }