基于Java的web爬虫,Arachnid
jopen
12年前
Arachnid是一个基于Java的Web spider(网络爬虫)框架。它包含一个简单的HTML解析器,能够分析包含HTML内容的输入流。通过继承Arachnid并实现其子类,只需增加几行代码,就能开发出一个简单的Web spider;这些代码会在Web站点上的每个页面被解析之后被调用。Arachnid的下载包中包含两个spider应用程序例子,用于演示如何使用该框架。
import java.io.*;
import java.net.*;
import java.util.*;
import bplatt.spider.*;

/**
 * Command-line tool that crawls a web site with an Arachnid-based spider and
 * prints a simple HTML site map (an unordered list of links) to standard output.
 *
 * <p>Usage: {@code java SimpleSiteMapGen <url>}
 */
public class SimpleSiteMapGen {
    /** Opening markup of the generated site map document. */
    private static final String HEADER = "<html><head><title>Site Map</title></head><body><ul>";
    /** Closing markup of the generated site map document. */
    private static final String TRAILER = "</ul></body></html>";

    /** Base URL of the site to crawl, exactly as given on the command line. */
    private final String site;

    /**
     * Program entry point. Expects exactly one argument, the base URL to crawl;
     * otherwise prints the usage line to standard error and exits with status -1.
     *
     * @param args command-line arguments
     */
    public static void main(String[] args) {
        if (args.length != 1) {
            System.err.println("java SimpleSiteMapGen <url>");
            System.exit(-1);
        }
        SimpleSiteMapGen s = new SimpleSiteMapGen(args[0]);
        s.generate();
    }

    /**
     * Creates a generator for the given site.
     *
     * @param site base URL of the site to crawl
     */
    public SimpleSiteMapGen(String site) {
        this.site = site;
    }

    /**
     * Crawls the site and writes the site map to standard output: the header,
     * one list item per page reported by the spider, then the trailer.
     * If the base URL is malformed, an error is written to standard error and
     * nothing is written to standard output.
     */
    public void generate() {
        MySpider spider;
        try {
            spider = new MySpider(site);
        } catch (MalformedURLException e) {
            System.err.println(e);
            System.err.println("Invalid URL: " + site);
            return;
        }
        System.out.println(HEADER);
        spider.traverse();
        System.out.println(TRAILER);
    }
}

/**
 * Spider that prints one HTML list item for every page that has both a URL
 * and a title. All other spider callbacks are deliberately no-ops because the
 * site map only lists successfully parsed pages.
 */
class MySpider extends Arachnid {
    /**
     * Creates a spider rooted at the given base URL.
     *
     * @param base base URL of the site to crawl
     * @throws MalformedURLException if {@code base} is not a valid URL
     */
    public MySpider(String base) throws MalformedURLException {
        super(base);
    }

    /**
     * Prints {@code <li><a href="URL">TITLE</a></li>} for the given page.
     * Pages without a URL or without a title are skipped.
     *
     * <p>Both values come from crawled (untrusted) content, so they are
     * HTML-escaped before being written into the markup.
     *
     * @param p information about the page that was just parsed
     */
    protected void handleLink(PageInfo p) {
        // Guard before dereferencing: calling toString() on a null URL would throw.
        if (p == null || p.getUrl() == null) {
            return;
        }
        String link = p.getUrl().toString();
        String title = p.getTitle();
        if (link == null || title == null || link.length() == 0 || title.length() == 0) {
            return;
        }
        // NOTE(review): assumes getTitle() returns decoded text; if it returns raw
        // markup with entities, escaping here would double-encode them - confirm.
        System.out.println(
                "<li><a href=\"" + escapeHtml(link) + "\">" + escapeHtml(title) + "</a></li>");
    }

    /** Intentionally empty: bad links are not listed in the site map. */
    protected void handleBadLink(URL url, URL parent, PageInfo p) {
    }

    /** Intentionally empty: pages that failed with I/O errors are not listed. */
    protected void handleBadIO(URL url, URL parent) {
    }

    /** Intentionally empty: non-HTML resources are not listed. */
    protected void handleNonHTMLlink(URL url, URL parent, PageInfo p) {
    }

    /** Intentionally empty: links leaving the site are not listed. */
    protected void handleExternalLink(URL url, URL parent) {
    }

    /**
     * Escapes the characters that are significant in HTML text and in
     * double- or single-quoted attribute values.
     *
     * @param text raw text, must not be {@code null}
     * @return {@code text} with {@code & < > " '} replaced by character references
     */
    private static String escapeHtml(String text) {
        StringBuilder out = new StringBuilder(text.length() + 16);
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            switch (c) {
                case '&':
                    out.append("&amp;");
                    break;
                case '<':
                    out.append("&lt;");
                    break;
                case '>':
                    out.append("&gt;");
                    break;
                case '"':
                    out.append("&quot;");
                    break;
                case '\'':
                    out.append("&#39;");
                    break;
                default:
                    out.append(c);
                    break;
            }
        }
        return out.toString();
    }
}