Implementing a Full-Site Crawler with Jsoup and MongoDB
Basic approach:
1. Seed the URL collection with one or more entry links in the initial state.
2. The crawler keeps a blacklist and a whitelist; only links that match at least one whitelist pattern and none of the blacklist patterns pass the filter.
3. Take a link from the URL collection, mark it as downloading, and download its page.
4. Insert the downloaded page into the content collection.
5. Parse the links out of the downloaded page, discard the ones rejected by the rules in step 2, and insert the remaining links into the URL collection in the initial state.
6. Mark the link as downloaded.
Steps 3 to 6 are then repeated in a loop. If the download in step 3 fails, the link is left in the downloading state and skipped, and the loop continues with the next link. The status values behind these states are sketched right after this list.
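In the implementation below, these states live in a numeric status field on each document of the url collection. A minimal sketch of the mapping, with illustrative constant names (the original code uses the raw values 0, 1 and 2 directly):

/** Illustrative names for the status values stored in the "url" collection. */
final class LinkStatus {
    static final int PENDING     = 0; // steps 1 and 5: link inserted, waiting to be crawled
    static final int DOWNLOADING = 1; // step 3: claimed by a crawler thread
    static final int DOWNLOADED  = 2; // step 6: page fetched and stored
    // A failed download (step 3) simply leaves the link at DOWNLOADING, so it is not retried.

    private LinkStatus() {
    }
}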
Code implementation
Blacklist and whitelist
package com.data;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;

/**
 * @author yun
 * @date 2015-03-25
 * @time 11:01:57 AM
 * @todo Blacklist and whitelist
 */
public class Regex {

    private List<String> blackList = new ArrayList<String>();
    private List<String> whiteList = new ArrayList<String>();

    public Regex(String blackPath, String whitePath) {
        loadPatterns(blackPath, blackList, "blacklist");
        loadPatterns(whitePath, whiteList, "whitelist");
    }

    /** Reads one regex pattern per line, skipping blank lines. */
    private void loadPatterns(String path, List<String> target, String name) {
        try (FileInputStream fis = new FileInputStream(path);
                InputStreamReader isr = new InputStreamReader(fis);
                BufferedReader br = new BufferedReader(isr)) {
            String line;
            while ((line = br.readLine()) != null) {
                line = line.trim();
                if (line.length() == 0) {
                    continue;
                }
                target.add(line);
            }
        } catch (Exception e) {
            System.out.println("Failed to read " + name + ": " + e.getMessage());
        }
    }

    public List<String> getBlackList() {
        return blackList;
    }

    public void setBlackList(List<String> blackList) {
        this.blackList = blackList;
    }

    public List<String> getWhiteList() {
        return whiteList;
    }

    public void setWhiteList(List<String> whiteList) {
        this.whiteList = whiteList;
    }
}
Regex matching
package com.data;

/**
 * @author yun
 * @date 2015-03-25
 * @time 11:02:01 AM
 * @todo Regex matching: a URL passes only if it matches no blacklist
 *       pattern and at least one whitelist pattern.
 */
public class Global {

    public static boolean regex(String url, Regex regex) {
        // Any blacklist hit rejects the URL immediately.
        for (String black : regex.getBlackList()) {
            if (url.matches(black)) {
                return false;
            }
        }
        // Otherwise the URL must match at least one whitelist pattern.
        for (String white : regex.getWhiteList()) {
            if (url.matches(white)) {
                return true;
            }
        }
        return false;
    }
}
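As a quick, self-contained check of the filter, the lists can be set directly through the setters instead of being read from files. The demo class and the placeholder constructor arguments below are illustrative, not part of the original project:

package com.data;

import java.util.Arrays;

/** Illustrative demo of the black/white list filter (not part of the original project). */
public class RegexDemo {

    public static void main(String[] args) {
        // The constructor arguments are placeholder paths; a missing file only logs a
        // warning, and the lists are then overwritten through the setters.
        Regex regex = new Regex("black", "white");
        regex.setBlackList(Arrays.asList(".*#", "mailto.*", ".*.pdf"));
        regex.setWhiteList(Arrays.asList("http://cjrb.cjn.cn/.*"));

        // Whitelisted and not blacklisted -> true
        System.out.println(Global.regex("http://cjrb.cjn.cn/html/2015-03/25/node_2.htm", regex));
        // Whitelisted host, but a PDF link hits the blacklist -> false
        System.out.println(Global.regex("http://cjrb.cjn.cn/page/1.pdf", regex));
        // Matches no whitelist pattern -> false
        System.out.println(Global.regex("http://example.com/index.html", regex));
    }
}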
Crawler class
package com.data;

import java.io.IOException;
import java.net.UnknownHostException;
import java.util.Date;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.data.util.Hash;
import com.data.util.ZLib;
import com.mongodb.BasicDBObject;
import com.mongodb.DBCollection;
import com.mongodb.DBObject;
import com.mongodb.MongoClient;

/**
 * @author yun
 * @date 2015-03-25
 * @time 10:54:49 AM
 * @todo Crawler
 */
public class Spider {

    // findAndModify: claim a pending link (status 0) and mark it as downloading (status 1).
    private BasicDBObject update = new BasicDBObject("$set", new BasicDBObject("status", 1));
    private BasicDBObject query = new BasicDBObject("status", 0);
    private MongoClient server;
    private Regex regex;

    public static void main(String[] args) {
        try {
            new Spider().execute();
        } catch (Exception e) {
            System.out.println(e.getMessage());
        }
    }

    public void execute() throws InterruptedException {
        init();
        // Run three crawler threads; each one claims links independently.
        Thread[] threads = new Thread[3];
        for (int x = 0; x < threads.length; x++) {
            threads[x] = new Thread(new Crawl());
            threads[x].start();
        }
        for (int x = 0; x < threads.length; x++) {
            threads[x].join();
        }
        server.close();
        System.out.println("end");
    }

    private void init() {
        try {
            server = new MongoClient("192.168.16.215");
        } catch (UnknownHostException e) {
            System.out.println(e.getMessage());
            return;
        }
        loadConfig();
    }

    public synchronized void loadConfig() {
        String blackPath = "D:/360pan/eclipse/workspace/Spider/bin/black";
        String whitePath = "D:/360pan/eclipse/workspace/Spider/bin/white";
        regex = new Regex(blackPath, whitePath);
    }

    // Step 5: extract links from the page and keep only those that pass the filter.
    private void analysisUrls(Document doc) {
        Elements select = doc.select("a[href]");
        for (Element link : select) {
            String url = link.absUrl("href");
            if (!Global.regex(url, regex)) {
                continue;
            }
            saveUrl(url);
        }
    }

    private void saveUrl(String url) {
        // Strip the fragment so the same page is not stored twice.
        if (url.contains("#")) {
            url = url.substring(0, url.indexOf("#"));
        }
        DBCollection collection = server.getDB("db").getCollection("url");
        BasicDBObject doc = new BasicDBObject();
        doc.append("url", url);
        doc.append("md5", Hash.getMd5String(url.getBytes()));
        doc.append("status", 0);
        doc.append("date", new Date());
        try {
            collection.insert(doc);
        } catch (Exception e) {
            // Duplicate key on the unique url index: the link is already known.
            return;
        }
    }

    class Crawl implements Runnable {

        @Override
        public void run() {
            DBCollection collection = server.getDB("db").getCollection("url");
            while (true) {
                // Atomically claim the next pending link (step 3 of the outline).
                DBObject find = collection.findAndModify(query, update);
                if (find == null) {
                    break;
                }
                String url = find.get("url").toString();
                Connection connect = Jsoup.connect(url).timeout(3000).followRedirects(true);
                Document doc = null;
                try {
                    doc = connect.get();
                } catch (IOException e) {
                    // Download failed: leave the link in the downloading state and move on.
                    System.out.println("crawl >> " + url + " >> " + e.getMessage());
                    continue;
                }
                System.out.println("crawl >> " + url);
                commitUrl(url);
                analysisUrls(doc);
                commitContent(doc.html(), url);
            }
        }
    }

    // Step 6: mark the link as downloaded (status 2).
    private void commitUrl(String url) {
        DBCollection collection = server.getDB("db").getCollection("url");
        BasicDBObject query = new BasicDBObject();
        query.put("url", url);
        BasicDBObject update = new BasicDBObject();
        BasicDBObject modify = new BasicDBObject();
        modify.put("status", 2);
        update.put("$set", modify);
        collection.update(query, update, true, true);
    }

    // Step 4: store the page HTML, compressed, in the content collection.
    private void commitContent(String content, String url) {
        try {
            DBCollection collection = server.getDB("db").getCollection("content");
            BasicDBObject doc = new BasicDBObject();
            doc.append("url", url);
            doc.append("data", ZLib.compress(content.getBytes("UTF-8")));
            doc.append("md5", Hash.getMd5String(url.getBytes()));
            doc.append("date", new Date());
            collection.insert(doc);
        } catch (Exception e) {
            return;
        }
    }
}
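The Hash and ZLib helpers in com.data.util are imported above but not listed in the article. A minimal sketch of what they might look like, assuming getMd5String returns a hex MD5 digest and compress produces plain zlib (Deflater) output; the real project may differ:

package com.data.util;

import java.security.MessageDigest;

/** Sketch of the MD5 helper used by the crawler (not part of the original article). */
public class Hash {

    public static String getMd5String(byte[] data) {
        try {
            MessageDigest md = MessageDigest.getInstance("MD5");
            StringBuilder hex = new StringBuilder();
            for (byte b : md.digest(data)) {
                hex.append(String.format("%02x", b));
            }
            return hex.toString();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }
}

package com.data.util;

import java.io.ByteArrayOutputStream;
import java.util.zip.Deflater;

/** Sketch of the zlib compression helper (not part of the original article). */
public class ZLib {

    public static byte[] compress(byte[] data) {
        Deflater deflater = new Deflater();
        deflater.setInput(data);
        deflater.finish();
        ByteArrayOutputStream out = new ByteArrayOutputStream(data.length);
        byte[] buffer = new byte[4096];
        while (!deflater.finished()) {
            out.write(buffer, 0, deflater.deflate(buffer));
        }
        deflater.end();
        return out.toByteArray();
    }
}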
Blacklist (the black file)
.*#
mailto.*
.*.pdf
Whitelist (the white file)
http://cjrb.cjn.cn/.*
Initializing the collections and the entry URL
package com.data;

import java.net.UnknownHostException;
import java.util.Date;

import com.data.util.Hash;
import com.mongodb.BasicDBObject;
import com.mongodb.DBCollection;
import com.mongodb.MongoClient;

public class Init {

    // Unique index on url (deduplication) plus an index on status for the claim query.
    public static void initUrlCollection(MongoClient server) {
        DBCollection collection = server.getDB("db").getCollection("url");
        BasicDBObject url_ = new BasicDBObject();
        url_.put("url", 1);
        collection.ensureIndex(url_, "url_", true);
        BasicDBObject status_ = new BasicDBObject();
        status_.put("status", 1);
        collection.ensureIndex(status_, "status_");
    }

    // Unique index on url in the content collection.
    public static void initContentCollection(MongoClient server) {
        DBCollection collection = server.getDB("db").getCollection("content");
        BasicDBObject url_ = new BasicDBObject();
        url_.put("url", 1);
        collection.ensureIndex(url_, "url_", true);
    }

    // Insert the entry link (Changjiang Daily) in the pending state.
    public static void initEntry(MongoClient server) {
        String url = "http://cjrb.cjn.cn/html/2015-03/25/node_2.htm";
        DBCollection collection = server.getDB("db").getCollection("url");
        BasicDBObject entry = new BasicDBObject();
        entry.put("url", url);
        entry.put("status", 0);
        entry.put("md5", Hash.getMd5String(url.getBytes()));
        entry.put("date", new Date());
        collection.insert(entry);
    }

    public static void main(String[] args) throws UnknownHostException {
        MongoClient server = new MongoClient("192.168.16.215");
        initUrlCollection(server);
        initContentCollection(server);
        initEntry(server);
        server.close();
    }
}
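To spot-check what the crawler stored, a document from the content collection can be read back and inflated. A sketch under the same assumption that the stored data field is plain zlib output; the Dump class below is illustrative, not part of the original project:

package com.data;

import java.io.ByteArrayOutputStream;
import java.util.zip.Inflater;

import com.mongodb.DBCollection;
import com.mongodb.DBObject;
import com.mongodb.MongoClient;

/** Sketch: read one stored page back from the content collection and print it. */
public class Dump {

    public static void main(String[] args) throws Exception {
        MongoClient server = new MongoClient("192.168.16.215");
        DBCollection collection = server.getDB("db").getCollection("content");

        DBObject doc = collection.findOne();
        if (doc != null) {
            byte[] compressed = (byte[]) doc.get("data");

            // Inflate the zlib-compressed HTML back into a UTF-8 string.
            Inflater inflater = new Inflater();
            inflater.setInput(compressed);
            ByteArrayOutputStream out = new ByteArrayOutputStream(compressed.length * 4);
            byte[] buffer = new byte[4096];
            while (!inflater.finished()) {
                out.write(buffer, 0, inflater.inflate(buffer));
            }
            inflater.end();

            System.out.println(doc.get("url"));
            System.out.println(out.toString("UTF-8"));
        }
        server.close();
    }
}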
After initialization, run the crawler class. While it runs, the console logs each crawled URL. At first the crawler also tried to fetch PDF links; once a PDF pattern was added to the blacklist, PDF links were no longer crawled. The crawled links and the compressed page content can then be inspected in the url and content collections in MongoDB.