利用归并排序算法对大文件进行排序

jopen 10年前

原文  http://ivarchen.iteye.com/blog/2179500

归并排序算法介绍,请参照Wikipeida

zh.wikipedia.org/wiki/%E5%BD%92%E5%B9%B6%E6%8E%92%E5%BA%8F

基本思想:

大文件分割成行数相等的两个小文件,使用递归一直分割到所有所有小文件低于限制行数

小文件直接排序

两个排序好的小文件归并到大文件

直到最后所有排序好的文件归并到输入的大文件并返回

之前看了网上很多示例代码,写的很不简洁, 引入了过多的临时变量i, j, k等等, 导致程序基本没法看,

只好自己写了一个,没有很关心执行效率, 只求够用,以后有机会再优化一下吧。

JDK要求Java 8

package com.java.sort.merge;  import com.google.common.base.Charsets;  import com.google.common.base.Strings;  import com.google.common.collect.ImmutableList;  import com.google.common.collect.Iterators;  import com.google.common.collect.PeekingIterator;  import com.google.common.io.Files;  import org.apache.commons.io.FileUtils;  import org.apache.commons.io.IOUtils;  import org.apache.commons.io.LineIterator;  import org.apache.commons.io.filefilter.AndFileFilter;  import org.apache.commons.io.filefilter.PrefixFileFilter;  import org.apache.commons.io.filefilter.SuffixFileFilter;  import org.junit.AfterClass;  import org.junit.BeforeClass;  import org.junit.Test;  import java.io.File;  import java.io.FilenameFilter;  import java.io.IOException;  import java.io.Writer;  import java.util.ArrayList;  import java.util.Collections;  import java.util.List;  public class FileMergeSort {    private static int limit = 9999;    private static void cleanTempFiles() {      FilenameFilter filter = new AndFileFilter(ImmutableList.of(new PrefixFileFilter("sort"), new SuffixFileFilter(".part")));      ImmutableList.copyOf(FileUtils.getTempDirectory().listFiles(filter)).forEach(File::delete);    }    private static int lineNumber(File input) throws IOException {      int count = 0;      LineIterator iterator = FileUtils.lineIterator(input);      while (iterator.hasNext()) {        iterator.next();        count++;      }      return count;    }    private static File split(File input, int from, int to) throws IOException {      File part = File.createTempFile("sort", ".part");      Long lineNumber = 0L;      String line = null;      List<String> lines = new ArrayList<>(to - from);      LineIterator iterator = FileUtils.lineIterator(input);      while (iterator.hasNext()) {        if (lineNumber > to) break;        line = iterator.next();        if (lineNumber >= from && lineNumber <= to) {          lines.add(line);        }        lineNumber++;      }      FileUtils.writeLines(part, lines);      return part;    }    private static File merge(File source, File left, File right) throws IOException {      PeekingIterator<String> leftLineIterator = Iterators.peekingIterator(FileUtils.lineIterator(left));      PeekingIterator<String> rightLineIterator = Iterators.peekingIterator(FileUtils.lineIterator(right));      String leftLine, rightLine;      try (Writer writer = Files.newWriter(source, Charsets.UTF_8)) {        writer.write("");        while (leftLineIterator.hasNext() && rightLineIterator.hasNext()) {          leftLine = leftLineIterator.peek();          rightLine = rightLineIterator.peek();          if (leftLine.compareTo(rightLine) < 0) {            writer.append(leftLine.concat(IOUtils.LINE_SEPARATOR));            leftLineIterator.next();          } else {            writer.append(rightLine.concat(IOUtils.LINE_SEPARATOR));            rightLineIterator.next();          }        }        while (leftLineIterator.hasNext()) {          writer.append(leftLineIterator.next().concat(IOUtils.LINE_SEPARATOR));        }        while (rightLineIterator.hasNext()) {          writer.append(rightLineIterator.next().concat(IOUtils.LINE_SEPARATOR));        }      }      return source;    }    private static File directSort(File input) throws IOException {      List<String> list = new ArrayList<>(limit);      FileUtils.lineIterator(input).forEachRemaining(list::add);      Collections.sort(list);      FileUtils.writeLines(input, list);      return input;    }    public static File mergeSort(File input) throws IOException {      int total = lineNumber(input);      if (total <= limit) {        return directSort(input);      }      int half = total / 2;      File left = mergeSort(split(input, 0, half));      File right = mergeSort(split(input, half + 1, total));      return merge(input, left, right);    }    @BeforeClass    public static void init() throws IOException {      cleanTempFiles();      try (Writer writer = Files.newWriter(new File("long.txt"), Charsets.UTF_8)) {        writer.write("");        for (long i = 99999L; i > 0L; i--) {          writer.append(Strings.padStart(String.valueOf(i), 5, '0').concat(IOUtils.LINE_SEPARATOR));        }      }    }    @AfterClass    public static void clean() {      cleanTempFiles();    }    @Test    public void testSort() throws IOException {      File sorted = mergeSort(new File("long.txt"));    }  }

<?xml version="1.0" encoding="UTF-8"?>  <project xmlns="http://maven.apache.org/POM/4.0.0"       xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"       xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">    <modelVersion>4.0.0</modelVersion>    <groupId>com.java.app</groupId>    <artifactId>sample</artifactId>    <version>1.0-SNAPSHOT</version>    <dependencies>      <dependency>        <groupId>commons-io</groupId>        <artifactId>commons-io</artifactId>        <version>2.4</version>      </dependency>      <dependency>        <groupId>org.apache.commons</groupId>        <artifactId>commons-csv</artifactId>        <version>1.1</version>      </dependency>      <dependency>        <groupId>com.google.guava</groupId>        <artifactId>guava</artifactId>        <version>18.0</version>      </dependency>      <dependency>        <groupId>javax.servlet</groupId>        <artifactId>javax.servlet-api</artifactId>        <version>3.1.0</version>      </dependency>      <dependency>        <groupId>org.apache.logging.log4j</groupId>        <artifactId>log4j-api</artifactId>        <version>2.1</version>      </dependency>      <dependency>        <groupId>org.apache.logging.log4j</groupId>        <artifactId>log4j-core</artifactId>        <version>2.1</version>      </dependency>      <dependency>        <groupId>org.apache.logging.log4j</groupId>        <artifactId>log4j-jcl</artifactId>        <version>2.1</version>      </dependency>      <dependency>        <groupId>junit</groupId>        <artifactId>junit</artifactId>        <version>4.12</version>        <scope>test</scope>      </dependency>    </dependencies>  </project>