MapReduce的数据流程、执行流程

jopen 12年前

MapReduce的数据流程：

预先加载本地的输入文件
经过MAP处理产生中间结果
经过shuffle程序将相同key的中间结果分发到同一节点上处理
Reduce处理产生结果输出
将结果输出保存在hdfs上

MapReduce的数据流程、执行流程

MAP

在map阶段，使用job.setInputFormatClass定义的InputFormat将输入的数据集分割成小数据块splits，
同时InputFormat提供一个RecordReader的实现。默认的是TextInputFormat，
它提供的RecordReader会将文本的一行的偏移量作为key，这一行的文本作为value。
这就是自定义Map的输入是<LongWritable, Text>的原因。
然后调用自定义Map的map方法，将一个个<LongWritable, Text>键值对输入给Map的map方法。

最终是按照自定义的Map的输出key类、输出value类生成一个List<MapOutputKeyClass, MapOutputValueClass>。

Partitioner

在map阶段的最后，会先调用job.setPartitionerClass设置的类对这个List进行分区，
每个分区映射到一个reducer。每个分区内又调用job.setSortComparatorClass设置的key比较函数类排序。

可以看到，这本身就是一个二次排序。
如果没有通过job.setSortComparatorClass设置key比较函数类，则使用key的实现的compareTo方法。

Shuffle：

将每个分区根据一定的规则，分发到reducer处理

Sort

在reduce阶段，reducer接收到所有映射到这个reducer的map输出后，
也是会调用job.setSortComparatorClass设置的key比较函数类对所有数据对排序。
然后开始构造一个key对应的value迭代器。这时就要用到分组，
使用job.setGroupingComparatorClass设置的分组函数类。只要这个比较器比较的两个key相同，
它们就属于同一个组，它们的value放在同一个value迭代器中。

Reduce
最后就是进入Reducer的reduce方法，reduce方法的输入是所有的（key和它的value迭代器）。
同样注意输入与输出的类型必须与自定义的Reducer中声明的一致。

MapReduce的数据流程、执行流程

具体的例子：

这是hadoop mapreduce example中的例子，自己改写了一下并加入了注释

import java.io.DataInput;  import java.io.DataOutput;  import java.io.IOException;  import java.util.StringTokenizer;    import org.apache.hadoop.conf.Configuration;  import org.apache.hadoop.fs.Path;  import org.apache.hadoop.io.IntWritable;  import org.apache.hadoop.io.LongWritable;  import org.apache.hadoop.io.RawComparator;  import org.apache.hadoop.io.Text;  import org.apache.hadoop.io.WritableComparable;  import org.apache.hadoop.io.WritableComparator;  import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;  import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;  import org.apache.hadoop.mapreduce.Job;  import org.apache.hadoop.mapreduce.Mapper;  import org.apache.hadoop.mapreduce.Partitioner;  import org.apache.hadoop.mapreduce.Reducer;  import org.apache.hadoop.util.GenericOptionsParser;    import com.catt.cdh.mr.example.SecondarySort2.FirstPartitioner;  import com.catt.cdh.mr.example.SecondarySort2.Reduce;    /**   * This is an example Hadoop Map/Reduce application.   * It reads the text input files that must contain two integers per a line.   * The output is sorted by the first and second number and grouped on the   * first number.   *    * To run: bin/hadoop jar build/hadoop-examples.jar secondarysort   * in-dir out-dir  */  public class SecondarySort {     /**    * Define a pair of integers that are writable.    * They are serialized in a byte comparable format.    */   public static class IntPair implements WritableComparable {    private int first = 0;    private int second = 0;      /**     * Set the left and right values.     */    public void set(int left, int right) {     first = left;     second = right;    }      public int getFirst() {     return first;    }      public int getSecond() {     return second;    }      /**     * Read the two integers.     
* Encoded as: MIN_VALUE -> 0, 0 -> -MIN_VALUE, MAX_VALUE-> -1     */    @Override    public void readFields(DataInput in) throws IOException {     first = in.readInt() + Integer.MIN_VALUE;     second = in.readInt() + Integer.MIN_VALUE;    }      @Override    public void write(DataOutput out) throws IOException {     out.writeInt(first - Integer.MIN_VALUE);     out.writeInt(second - Integer.MIN_VALUE);    }      @Override    // The hashCode() method is used by the HashPartitioner (the default    // partitioner in MapReduce)    public int hashCode() {     return first * 157 + second;    }      @Override    public boolean equals(Object right) {     if (right instanceof IntPair) {      IntPair r = (IntPair) right;      return r.first == first && r.second == second;     } else {      return false;     }    }      /** A Comparator that compares serialized IntPair. */    public static class Comparator extends WritableComparator {     public Comparator() {      super(IntPair.class);     }       // 针对key进行比较，调用多次     public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2,       int l2) {      return compareBytes(b1, s1, l1, b2, s2, l2);     }    }      static {     // 注意：如果不进行注册，则使用key.compareTo方法进行key的比较     // register this comparator     WritableComparator.define(IntPair.class, new Comparator());    }      // 如果不注册WritableComparator，则使用此方法进行key的比较    @Override    public int compareTo(IntPair o) {     if (first != o.first) {      return first < o.first ? -1 : 1;     } else if (second != o.second) {      return second < o.second ? -1 : 1;     } else {      return 0;     }    }   }     /**    * Partition based on the first part of the pair.    
*/   public static class FirstPartitioner extends     Partitioner {    @Override    public int getPartition(IntPair key, IntWritable value,      int numPartitions) {     return Math.abs(key.getFirst() * 127) % numPartitions;    }   }     /**    * Compare only the first part of the pair, so that reduce is called once    * for each value of the first part.    */   public static class FirstGroupingComparator implements     RawComparator {      // 针对key调用，调用多次    @Override    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {     return WritableComparator.compareBytes(b1, s1, Integer.SIZE / 8,       b2, s2, Integer.SIZE / 8);    }      // 没有监控到被调用，不知道有什么用    @Override    public int compare(IntPair o1, IntPair o2) {     int l = o1.getFirst();     int r = o2.getFirst();     return l == r ? 0 : (l < r ? -1 : 1);    }   }     /**    * Read two integers from each line and generate a key, value pair    * as ((left, right), right).    */   public static class MapClass extends     Mapper {      private final IntPair key = new IntPair();    private final IntWritable value = new IntWritable();      @Override    public void map(LongWritable inKey, Text inValue, Context context)      throws IOException, InterruptedException {     StringTokenizer itr = new StringTokenizer(inValue.toString());     int left = 0;     int right = 0;     if (itr.hasMoreTokens()) {      left = Integer.parseInt(itr.nextToken());      if (itr.hasMoreTokens()) {       right = Integer.parseInt(itr.nextToken());      }      key.set(left, right);      value.set(right);      context.write(key, value);     }    }   }     /**    * A reducer class that just emits the sum of the input values.    
*/   public static class Reduce extends     Reducer {    private static final Text SEPARATOR = new Text(      "------------------------------------------------");    private final Text first = new Text();      @Override    public void reduce(IntPair key, Iterable values,      Context context) throws IOException, InterruptedException {     context.write(SEPARATOR, null);     first.set(Integer.toString(key.getFirst()));     for (IntWritable value : values) {      context.write(first, value);     }    }   }     public static void main(String[] args) throws Exception {    Configuration conf = new Configuration();    String[] ars = new String[] { "hdfs://data2.kt:8020/test/input",      "hdfs://data2.kt:8020/test/output" };    conf.set("fs.default.name", "hdfs://data2.kt:8020/");      String[] otherArgs = new GenericOptionsParser(conf, ars)      .getRemainingArgs();    if (otherArgs.length != 2) {     System.err.println("Usage: secondarysort  ");     System.exit(2);    }    Job job = new Job(conf, "secondary sort");    job.setJarByClass(SecondarySort.class);    job.setMapperClass(MapClass.class);      // 不再需要Combiner类型，因为Combiner的输出类型对Reduce的输入类型不适用    // job.setCombinerClass(Reduce.class);    // Reducer类型    job.setReducerClass(Reduce.class);    // 分区函数    job.setPartitionerClass(FirstPartitioner.class);    // 设置setSortComparatorClass，在partition后，    // 每个分区内又调用job.setSortComparatorClass设置的key比较函数类排序    // 另外，在reducer接收到所有映射到这个reducer的map输出后，    // 也是会调用job.setSortComparatorClass设置的key比较函数类对所有数据对排序    // job.setSortComparatorClass(GroupingComparator2.class);    // 分组函数      job.setGroupingComparatorClass(FirstGroupingComparator.class);      // the map output is IntPair, IntWritable    // 针对自定义的类型，需要指定MapOutputKeyClass    job.setMapOutputKeyClass(IntPair.class);    // job.setMapOutputValueClass(IntWritable.class);      // the reduce output is Text, IntWritable    job.setOutputKeyClass(Text.class);    job.setOutputValueClass(IntWritable.class);      
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));    System.exit(job.waitForCompletion(true) ? 0 : 1);   }    }

MapReduce的数据流程、执行流程

相关经验

目录