使用lingpipe自然语言处理包进行文本分类

fmms 14年前

TrainTClassifier，基于 TF/IDF 算法的分类器。训练前必须先把语料库中的文章放到各自所属的分类文件夹中，比如：与金融相关的文章就放到“金融”这个文件夹中。我这里的根目录是 f:/data/category。训练完成后会生成一个分类器模型文件 tclassifier，之后其它文本的分类就是通过它来确定的。

/**    * 使用 Lingpipe的TF/IDF分类器训练语料    *     * @author laigood    */    public class TrainTClassifier {            //训练语料文件夹        private static File TDIR = new File("f:\\data\\category");        //定义分类        private static String[] CATEGORIES = { "金融", "军事", "医学", "饮食" };            public static void main(String[] args) throws ClassNotFoundException,                IOException {                        TfIdfClassifierTrainer<CharSequence> classifier = new TfIdfClassifierTrainer<CharSequence>(                    new TokenFeatureExtractor(CharacterTokenizerFactory.INSTANCE));                // 开始训练            for (int i = 0; i < CATEGORIES.length; i++) {                File classDir = new File(TDIR, CATEGORIES[i]);                if (!classDir.isDirectory()) {                    System.out.println("不能找到目录=" + classDir);                }                    // 训练器遍历分类文件夹下的所有文件                for (File file : classDir.listFiles()) {                    String text = Files.readFromFile(file, "utf-8");                    System.out.println("正在训练 " + CATEGORIES[i] + file.getName());                    Classification classification = new Classification(                            CATEGORIES[i]);                    Classified<CharSequence> classified = new Classified<CharSequence>(                            text, classification);                    classifier.handle(classified);                }             }                            // 把分类器模型写到文件上            System.out.println("开始生成分类器");            String modelFile = "f:\\data\\category\\tclassifier";            ObjectOutputStream os = new ObjectOutputStream(new FileOutputStream(                    modelFile));            classifier.compileTo(os);            os.close();                        System.out.println("分类器生成完成");        }    }

TestTClassifier，用于测试分类的准确度。测试数据的存放方式与上面的训练语料类似：每个分类一个文件夹。

public class TestTClassifier {            //测试语料的存放目录        private static File TDIR = new File("f:\\data\\test");        private static String[] CATEGORIES = { "金融", "军事", "医学", "饮食" };            public static void main(String[] args) throws ClassNotFoundException {                        //分类器模型存放地址            String modelFile = "f:\\data\\category\\tclassifier";            ScoredClassifier<CharSequence> compiledClassifier = null;            try {                ObjectInputStream oi = new ObjectInputStream(new FileInputStream(                        modelFile));                compiledClassifier = (ScoredClassifier<CharSequence>) oi                        .readObject();                oi.close();            } catch (IOException ie) {                System.out.println("IO Error: Model file " + modelFile + " missing");            }                // 遍历分类目录中的文件测试分类准确度            ConfusionMatrix confMatrix = new ConfusionMatrix(CATEGORIES);            NumberFormat nf = NumberFormat.getInstance();            nf.setMaximumIntegerDigits(1);            nf.setMaximumFractionDigits(3);            for (int i = 0; i < CATEGORIES.length; ++i) {                File classDir = new File(TDIR, CATEGORIES[i]);                    //对于每一个文件，通过分类器找出最适合的分类                for (File file : classDir.listFiles()) {                    String text = "";                    try {                        text = Files.readFromFile(file, "utf-8");                    } catch (IOException ie) {                        System.out.println("不能读取 " + file.getName());                    }                    System.out.println("测试 " + CATEGORIES[i]                            + File.separator + file.getName());                        ScoredClassification classification = compiledClassifier                            .classify(text.subSequence(0, text.length()));                    confMatrix.increment(CATEGORIES[i],                            classification.bestCategory());                    
System.out.println("最适合的分类: "                            + classification.bestCategory());                }             }                 System.out.println("--------------------------------------------");            System.out.println("- 结果 ");            System.out.println("--------------------------------------------");            int[][] imatrix = confMatrix.matrix();            StringBuffer sb = new StringBuffer();            sb.append(StringTools.fillin("CATEGORY", 10, true, ' '));            for (int i = 0; i < CATEGORIES.length; i++)                sb.append(StringTools.fillin(CATEGORIES[i], 8, false, ' '));            System.out.println(sb.toString());                for (int i = 0; i < imatrix.length; i++) {                sb = new StringBuffer();                sb.append(StringTools.fillin(CATEGORIES[i], 10, true, ' ',                        10 - CATEGORIES[i].length()));                for (int j = 0; j < imatrix.length; j++) {                    String out = "" + imatrix[i][j];                    sb.append(StringTools.fillin(out, 8, false, ' ',                            8 - out.length()));                }                System.out.println(sb.toString());            }                System.out.println("准确度: "                    + nf.format(confMatrix.totalAccuracy()));            System.out.println("总共正确数 : " + confMatrix.totalCorrect());            System.out.println("总数：" + confMatrix.totalCount());        }    }

补上StringTools

/**   * A class containing a bunch of string utilities - <br>   * a. filterChars: Remove extraneous characters from a string and return a   * "clean" string. <br>   * b. getSuffix: Given a file name return its extension. <br>   * c. fillin: pad or truncate a string to a fixed number of characters. <br>   * d. removeAmpersandStrings: remove strings that start with ampersand <br>   * e. shaDigest: Compute the 40 byte digest signature of a string <br>   */  public class StringTools {    public static final Locale LOCALE = new Locale("en");    // * -- String limit for StringTools    private static int STRING_TOOLS_LIMIT = 1000000;    // *-- pre-compiled RE patterns    private static Pattern extPattern = Pattern.compile("^.*[.](.*?){1}quot;);    private static Pattern spacesPattern = Pattern.compile("\\s+");    private static Pattern removeAmpersandPattern = Pattern.compile("&[^;]*?;");      /**     * Removes non-printable spaces and replaces with a single space     *      * @param in     *          String with mixed characters     * @return String with collapsed spaces and printable characters     */    public static String filterChars(String in) {      return (filterChars(in, "", ' ', true));    }      public static String filterChars(String in, boolean newLine) {      return (filterChars(in, "", ' ', newLine));    }      public static String filterChars(String in, String badChars) {      return (filterChars(in, badChars, ' ', true));    }      public static String filterChars(String in, char replaceChar) {      return (filterChars(in, "", replaceChar, true));    }      public static String filterChars(String in, String badChars,        char replaceChar, boolean newLine) {      if (in == null)        return "";      int inLen = in.length();      if (inLen > STRING_TOOLS_LIMIT)        return in;      try {        // **-- replace non-recognizable characters with spaces        StringBuffer out = new StringBuffer();        int badLen = badChars.length();        for (int i 
= 0; i < inLen; i++) {          char ch = in.charAt(i);          if ((badLen != 0) && removeChar(ch, badChars)) {            ch = replaceChar;          } else if (!Character.isDefined(ch) && !Character.isSpaceChar(ch)) {            ch = replaceChar;          }          out.append(ch);        }          // *-- replace new lines with space        Matcher matcher = null;        in = out.toString();          // *-- replace consecutive spaces with single space and remove        // leading/trailing spaces        in = in.trim();        matcher = spacesPattern.matcher(in);        in = matcher.replaceAll(" ");      } catch (OutOfMemoryError e) {        return in;      }        return in;    }      // *-- remove any chars found in the badChars string    private static boolean removeChar(char ch, String badChars) {      if (badChars.length() == 0)        return false;      for (int i = 0; i < badChars.length(); i++) {        if (ch == badChars.charAt(i))          return true;      }      return false;    }      /**     * Return the extension of a file, if possible.     
*      * @param filename     * @return string     */    public static String getSuffix(String filename) {      if (filename.length() > STRING_TOOLS_LIMIT)        return ("");      Matcher matcher = extPattern.matcher(filename);      if (!matcher.matches())        return "";      return (matcher.group(1).toLowerCase(LOCALE));    }      public static String fillin(String in, int len) {      return fillin(in, len, true, ' ', 3);    }      public static String fillin(String in, int len, char fillinChar) {      return fillin(in, len, true, fillinChar, 3);    }      public static String fillin(String in, int len, boolean right) {      return fillin(in, len, right, ' ', 3);    }      public static String fillin(String in, int len, boolean right, char fillinChar) {      return fillin(in, len, right, fillinChar, 3);    }      /**     * Return a string concatenated or padded to the specified length     *      * @param in     *          string to be truncated or padded     * @param len     *          int length for string     * @param right     *          boolean fillin from the left or right     * @param fillinChar     *          char to pad the string     * @param numFills     *          int number of characters to pad     * @return String of specified length     */    public static String fillin(String in, int len, boolean right,        char fillinChar, int numFills) {      // *-- return if string is of required length      int slen = in.length();      if ((slen == len) || (slen > STRING_TOOLS_LIMIT))        return (in);        // *-- build the fillin string      StringBuffer fillinStb = new StringBuffer();      for (int i = 0; i < numFills; i++)        fillinStb.append(fillinChar);      String fillinString = fillinStb.toString();        // *-- truncate and pad string if length exceeds required length      if (slen > len) {        if (right)          return (in.substring(0, len - numFills) + fillinString);        else          return (fillinString + in.substring(slen - len 
+ numFills, slen));      }        // *-- pad string if length is less than required length DatabaseEntry      // dbe = dbt.getNextKey(); String dbkey = new String (dbe.getData());      StringBuffer sb = new StringBuffer();      if (right)        sb.append(in);      sb.append(fillinString);      if (!right)        sb.append(in);      return (sb.toString());    }      /**     * Remove ampersand strings such as \&nbsp;     *      * @param in     *          Text string extracted from Web pages     * @return String Text string without ampersand strings     */    public static String removeAmpersandStrings(String in) {      if (in.length() > STRING_TOOLS_LIMIT)        return (in);      Matcher matcher = removeAmpersandPattern.matcher(in);      return (matcher.replaceAll(""));    }      /**     * Escape back slashes     *      * @param in     *          Text to be escaped     * @return String Escaped test     */    public static String escapeText(String in) {      StringBuffer sb = new StringBuffer();      for (int i = 0; i < in.length(); i++) {        char ch = in.charAt(i);        if (ch == '\\')          sb.append("\\\\");        else          sb.append(ch);      }      return (sb.toString());    }      /**     * Get the SHA signature of a string     *      * @param in     *          String     * @return String SHA signature of in     */    public static String shaDigest(String in) {      StringBuffer out = new StringBuffer();      if ((in == null) || (in.length() == 0))        return ("");      try {        // *-- create a message digest instance and compute the hash        // byte array        MessageDigest md = MessageDigest.getInstance("SHA-1");        md.reset();        md.update(in.getBytes());        byte[] hash = md.digest();          // *--- Convert the hash byte array to hexadecimal format, pad        // hex chars with leading zeroes        // *--- to get a signature of consistent length (40) for all        // strings.        
for (int i = 0; i < hash.length; i++) {          out.append(fillin(Integer.toString(0xFF & hash[i], 16), 2, false, '0',              1));        }      } catch (OutOfMemoryError e) {        return ("<-------------OUT_OF_MEMORY------------>");      } catch (NoSuchAlgorithmException e) {        return ("<------SHA digest algorithm not found--->");      }        return (out.toString());    }      /**     * Return the string with the first letter upper cased     *      * @param in     * @return String     */    public static String firstLetterUC(String in) {      if ((in == null) || (in.length() == 0))        return ("");      String out = in.toLowerCase(LOCALE);      String part1 = out.substring(0, 1);      String part2 = out.substring(1, in.length());      return (part1.toUpperCase(LOCALE) + part2.toLowerCase(LOCALE));    }      /**     * Return a pattern that can be used to collapse consecutive patterns of the     * same type     *      * @param entityTypes     *          A list of entity types     * @return Regex pattern for the entity types     */    public static Pattern getCollapsePattern(String[] entityTypes) {      Pattern collapsePattern = null;      StringBuffer collapseStr = new StringBuffer();      for (int i = 0; i < entityTypes.length; i++) {        collapseStr.append("(<\\/");        collapseStr.append(entityTypes[i]);        collapseStr.append(">\\s+");        collapseStr.append("<");        collapseStr.append(entityTypes[i]);        collapseStr.append(">)|");      }      collapsePattern = Pattern.compile(collapseStr.toString().substring(0,          collapseStr.length() - 1));      return (collapsePattern);    }      /**     * return a double that indicates the degree of similarity between two strings     * Use the Jaccard similarity, i.e. 
the ratio of A intersection B to A union B     *      * @param first     *          string     * @param second     *          string     * @return double degreee of similarity     */    public static double stringSimilarity(String first, String second) {      if ((first == null) || (second == null))        return (0.0);      String[] a = first.split("\\s+");      String[] b = second.split("\\s+");        // *-- compute a union b      HashSet<String> aUnionb = new HashSet<String>();      HashSet<String> aTokens = new HashSet<String>();      HashSet<String> bTokens = new HashSet<String>();      for (int i = 0; i < a.length; i++) {        aUnionb.add(a[i]);        aTokens.add(a[i]);      }      for (int i = 0; i < b.length; i++) {        aUnionb.add(b[i]);        bTokens.add(b[i]);      }      int sizeAunionB = aUnionb.size();        // *-- compute a intersect b      Iterator <String> iter = aUnionb.iterator();      int sizeAinterB = 0;      while (iter != null && iter.hasNext()) {        String token = (String) iter.next();        if (aTokens.contains(token) && bTokens.contains(token))          sizeAinterB++;      }      return ((sizeAunionB > 0) ? 
(sizeAinterB + 0.0) / sizeAunionB : 0.0);    }      /**     * Return the edit distance between the two strings     *      * @param s1     * @param s2     * @return double     */    public static double editDistance(String s1, String s2) {      if ((s1.length() == 0) || (s2.length() == 0))        return (0.0);      return EditDistance.editDistance(s1.subSequence(0, s1.length()), s2          .subSequence(0, s2.length()), false);    }      /**     * Return a string with the contents from the passed reader     *      * @param r Reader     * @return String     */    public static String readerToString(Reader r) {      int charValue;      StringBuffer sb = new StringBuffer(1024);      try {        while ((charValue = r.read()) != -1)          sb.append((char) charValue);      } catch (IOException ie) {        sb.setLength(0);      }      return (sb.toString());    }      /**     * Clean up a sentence by consecutive non-alphanumeric chars with a single     * non-alphanumeric char     *      * @param in Array of chars     * @return String     */    public static String cleanString(char[] in) {      int len = in.length;      boolean prevOK = true;      for (int i = 0; i < len; i++) {        if (Character.isLetterOrDigit(in[i]) || Character.isWhitespace(in[i]))          prevOK = true;        else {          if (!prevOK)            in[i] = ' ';          prevOK = false;        }      }      return (new String(in));    }      /**     * Return a clean file name     *      * @param filename     * @return String     */    public static String parseFile(String filename) {      return (filterChars(filename, "\\/_:."));    }  }

转自：http://blog.csdn.net/laigood12345/article/details/6680201

使用lingpipe自然语言处理包进行文本分类

相关经验

目录