基于Java的web爬虫,Arachnid

jopen 12年前

Arachnid是一个基于Java的Web spider(网络爬虫)框架。它包含一个简单的HTML解析器,能够分析包含HTML内容的输入流。通过继承Arachnid实现一个子类,只需增加几行代码,就能开发出一个简单的Web spider;这些代码会在网站上的每个页面被解析之后被调用。Arachnid的下载包中包含两个spider应用程序例子,用于演示如何使用该框架。

import java.io.*;
import java.net.*;
import java.util.*;
import bplatt.spider.*;

/**
 * Command-line example that prints a simple HTML site map to standard output.
 *
 * <p>Usage: {@code java SimpleSiteMapGen <url>}. The program writes an HTML
 * header, lets a {@link MySpider} traverse the given site (each reported page
 * becomes one list item), and then writes the HTML trailer.
 */
public class SimpleSiteMapGen {
  /** Opening markup of the generated site map. */
  private static final String HEADER =
      "<html><head><title>Site Map</title></head><body><ul>";

  /** Closing markup of the generated site map. */
  private static final String TRAILER = "</ul></body></html>";

  /** Base URL of the site to map, exactly as given on the command line. */
  private final String site;

  /**
   * Program entry point.
   *
   * @param args exactly one element: the base URL of the site to map
   */
  public static void main(String[] args) {
    if (args.length != 1) {
      System.err.println("java SimpleSiteMapGen <url>");
      System.exit(-1);
    }
    SimpleSiteMapGen generator = new SimpleSiteMapGen(args[0]);
    generator.generate();
  }

  /**
   * Creates a generator for the given site.
   *
   * @param site base URL of the site to map
   */
  public SimpleSiteMapGen(String site) {
    this.site = site;
  }

  /**
   * Writes the complete site map to standard output.
   *
   * <p>If the site URL is malformed, the exception and an "Invalid URL" message
   * are printed to standard error and nothing is written to standard output.
   */
  public void generate() {
    MySpider spider;
    try {
      spider = new MySpider(site);
    } catch (MalformedURLException e) {
      System.err.println(e);
      System.err.println("Invalid URL: " + site);
      return;
    }
    System.out.println(HEADER);
    // NOTE(review): traverse() is presumably where Arachnid crawls the site and
    // invokes the handle* callbacks below -- confirm against the Arachnid docs.
    spider.traverse();
    System.out.println(TRAILER);
  }
}

/**
 * Spider that prints one {@code <li>} entry per page that has both a URL and a
 * title; every other kind of link is ignored.
 */
class MySpider extends Arachnid {
  /**
   * @param base base URL of the site to traverse
   * @throws MalformedURLException propagated from the {@code Arachnid} constructor
   */
  public MySpider(String base) throws MalformedURLException {
    super(base);
  }

  /**
   * Prints a list item linking to the page, or nothing when the page has no
   * usable URL or title.
   *
   * <p>Both the URL and the title come from crawled pages, so they are
   * HTML-escaped before being embedded in the generated markup; otherwise a
   * title containing {@code <}, {@code &} or {@code "} would corrupt the site map.
   *
   * @param p information about the page
   */
  protected void handleLink(PageInfo p) {
    // NOTE(review): getUrl() is dereferenced before the null check, as in the
    // original; it is assumed never to return null here -- TODO confirm.
    String link = p.getUrl().toString();
    String title = p.getTitle();
    if (link == null || title == null || link.length() == 0 || title.length() == 0) {
      return;
    }
    System.out.println(
        "<li><a href=\"" + escapeHtml(link) + "\">" + escapeHtml(title) + "</a></li>");
  }

  /** Intentionally empty: bad links are left out of the site map. */
  protected void handleBadLink(URL url, URL parent, PageInfo p) { }

  /** Intentionally empty: pages that caused I/O problems are left out of the site map. */
  protected void handleBadIO(URL url, URL parent) { }

  /** Intentionally empty: non-HTML links are left out of the site map. */
  protected void handleNonHTMLlink(URL url, URL parent, PageInfo p) { }

  /** Intentionally empty: external links are left out of the site map. */
  protected void handleExternalLink(URL url, URL parent) { }

  /**
   * Escapes the characters that are significant in HTML text and in
   * double- or single-quoted attribute values.
   *
   * @param text non-null raw text
   * @return {@code text} with {@code & < > " '} replaced by character references
   */
  private static String escapeHtml(String text) {
    StringBuilder escaped = new StringBuilder(text.length() + 16);
    for (int i = 0; i < text.length(); i++) {
      char c = text.charAt(i);
      switch (c) {
        case '&':
          escaped.append("&amp;");
          break;
        case '<':
          escaped.append("&lt;");
          break;
        case '>':
          escaped.append("&gt;");
          break;
        case '"':
          escaped.append("&quot;");
          break;
        case '\'':
          escaped.append("&#39;");
          break;
        default:
          escaped.append(c);
          break;
      }
    }
    return escaped.toString();
  }
}

项目主页:http://www.open-open.com/lib/view/home/1349860318900