News Clustering Based on the Fuzzy K-Means Algorithm

MamieH92 · 9 years ago

Source: http://blog.csdn.net/u012965373/article/details/50754449


/**
 * @author YangXin
 * @info News clustering based on the fuzzy K-Means algorithm
 */
package unitNine;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.lucene.analysis.Analyzer;
import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.canopy.CanopyDriver;
import org.apache.mahout.clustering.classify.WeightedVectorWritable;
import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
import org.apache.mahout.common.distance.TanimotoDistanceMeasure;
import org.apache.mahout.vectorizer.DictionaryVectorizer;
import org.apache.mahout.vectorizer.DocumentProcessor;
import org.apache.mahout.vectorizer.tfidf.TFIDFConverter;

public class NewsFuzzyKMeansClustering {

  public static void main(String[] args) throws Exception {
    // Vectorization parameters: minimum term support, document-frequency
    // bounds, n-gram size, LLR threshold, and the norm used for TF-IDF vectors.
    int minSupport = 5;
    int minDf = 10;
    int maxDFPercent = 70;
    int maxNGramSize = 1;
    int minLLRValue = 200;
    int reduceTasks = 1;
    int chunkSize = 200;
    int norm = 2;
    boolean sequentialAccessOutput = true;

    String inputDir = "inputDir";

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    /*
    // Write the raw documents into a SequenceFile of <Text, Text> pairs
    // (document ID -> document contents) before running the pipeline.
    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf,
        new Path(inputDir, "documents.seq"), Text.class, Text.class);
    for (Document d : Database) {
      writer.append(new Text(d.getID()), new Text(d.contents()));
    }
    writer.close();
    */

    String outputDir = "newsClusters";
    HadoopUtil.delete(conf, new Path(outputDir));

    // Tokenize the documents with a custom Lucene analyzer (MyAnalyzer is not
    // defined in this post; see the sketch after this listing).
    Path tokenizedPath = new Path(outputDir,
        DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
    MyAnalyzer analyzer = new MyAnalyzer();
    DocumentProcessor.tokenizeDocuments(new Path(inputDir), analyzer.getClass()
        .asSubclass(Analyzer.class), tokenizedPath, conf);

    // Build term-frequency vectors, then convert them to TF-IDF vectors.
    DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath,
        new Path(outputDir), conf, minSupport, maxNGramSize, minLLRValue, 2, true, reduceTasks,
        chunkSize, sequentialAccessOutput, false);
    TFIDFConverter.processTfIdf(
        new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
        new Path(outputDir), conf, chunkSize, minDf,
        maxDFPercent, norm, true, sequentialAccessOutput, false, reduceTasks);

    String vectorsFolder = outputDir + "/tfidf-vectors";
    String canopyCentroids = outputDir + "/canopy-centroids";
    String clusterOutput = outputDir + "/clusters/";

    // Estimate the initial cluster centroids with canopy clustering.
    CanopyDriver.run(conf, new Path(vectorsFolder), new Path(canopyCentroids),
        new ManhattanDistanceMeasure(), 3000.0, 2000.0, false, false);

    // Run fuzzy K-Means from the canopy centroids with fuzziness factor 2.0.
    FuzzyKMeansDriver.run(conf, new Path(vectorsFolder),
        new Path(canopyCentroids, "clusters-0"), new Path(clusterOutput),
        new TanimotoDistanceMeasure(), 0.01, 20, 2.0f, true, true, 0.0, false);

    // Print each point's cluster assignment from the clustered-points output.
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(
        clusterOutput + Cluster.CLUSTERED_POINTS_DIR + "/part-m-00000"), conf);

    IntWritable key = new IntWritable();
    WeightedVectorWritable value = new WeightedVectorWritable();
    while (reader.next(key, value)) {
      System.out.println("Cluster: " + key.toString() + " "
          + value.getVector().asFormatString());
    }
    reader.close();
  }
}
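The listing instantiates a MyAnalyzer class that the post never shows. Below is a minimal placeholder sketch, assuming the Lucene 3.x Analyzer API (tokenStream override) that shipped with Mahout releases of that era and the Version.LUCENE_36 constant; it only lower-cases the output of the standard tokenizer, whereas the original analyzer may also strip stop words or non-alphabetic tokens.

package unitNine;

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;

/**
 * Placeholder for the custom analyzer used above (a sketch, not the
 * original author's implementation). Tokenizes with StandardTokenizer
 * and lower-cases every token.
 */
public class MyAnalyzer extends Analyzer {
  @Override
  public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream stream = new StandardTokenizer(Version.LUCENE_36, reader);
    stream = new LowerCaseFilter(Version.LUCENE_36, stream);
    return stream;
  }
}

Note that DocumentProcessor.tokenizeDocuments instantiates the analyzer class by reflection, so whatever analyzer is used must have a public no-argument constructor, as this sketch does.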