mapreduce实现"浏览该商品的人大多数还浏览了"经典应用

jopen 11年前

输入:

日期            cookie id            商品id

xx            xx                        xx

输出:

商品id         商品id列表(按优先级排序,用逗号分隔)

xx                   xx

比如:

id1              id3,id0,id4,id2

id2             id0,id5

整个计算过程分为4步

1、从原始日志中提取日期、cookie id、商品id信息,按天计算,最后输出的数据格式如下:

商品id-0 商品id-1

xx           xx

这一步做了一次优化:保证商品id-0一定小于商品id-1,每对商品只存储一次,以减少存储量;在最后汇总数据时再做一次转置(把 a b 再输出为 b a)即可

reduce做局部排序及排重

 

2、基于上次的结果做汇总,按天计算

商品id-0 商品id-1  关联值(关联值即同时访问这两个商品的用户数)

xx             xx                xx

 

3、汇总最近三个月数据,同时考虑时间衰减,时间越久关联值的贡献越低,最后输出两两商品的关联值(包括转置后)

 

4、行列转换,生成最后要的推荐结果数据,按关联值排序生成

 

第一个MR

import java.io.IOException;  import java.util.ArrayList;  import org.apache.hadoop.conf.Configuration;  import org.apache.hadoop.fs.FileSystem;  import org.apache.hadoop.fs.Path;  import org.apache.hadoop.io.LongWritable;  import org.apache.hadoop.io.Text;  import org.apache.hadoop.io.WritableComparable;  import org.apache.hadoop.io.WritableComparator;  import org.apache.hadoop.mapreduce.Job;  import org.apache.hadoop.mapreduce.Mapper;  import org.apache.hadoop.mapreduce.Partitioner;  import org.apache.hadoop.mapreduce.Reducer;  import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;  import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;  import org.apache.hadoop.util.GenericOptionsParser;  import org.apache.log4j.Logger;      /*   * 输入:原始数据,会有重复   *日期 cookie 楼盘id   *    * 输出:   * 日期 楼盘id1 楼盘id2  //楼盘id1一定小于楼盘id2 ,按日期 cookie进行分组   *    */    public class HouseMergeAndSplit {      public static class Partitioner1 extends Partitioner<TextPair, Text> {      @Override      public int getPartition(TextPair key, Text value, int numParititon) {         return Math.abs((new Text(key.getFirst().toString()+key.getSecond().toString())).hashCode() * 127) % numParititon;        }   }      public static class Comp1 extends WritableComparator {       public Comp1() {        super(TextPair.class, true);       }       @SuppressWarnings("unchecked")       public int compare(WritableComparable a, WritableComparable b) {        TextPair t1 = (TextPair) a;        TextPair t2 = (TextPair) b;        int comp= t1.getFirst().compareTo(t2.getFirst());        if (comp!=0)         return comp;        return t1.getSecond().compareTo(t2.getSecond());       }     }     public static class TokenizerMapper           extends Mapper<LongWritable, Text, TextPair, Text>{           Text val=new Text("test");       public void map(LongWritable key, Text value, Context context                       ) throws IOException, InterruptedException {                         &nbsp; String 
s[]=value.toString().split("\001");                  TextPair tp=new TextPair(s[0],s[1],s[4]+s[3]); //thedate cookie city+houseid           context.write(tp, val);       }     }          public static class IntSumReducer           extends Reducer<TextPair,Text,Text,Text> {      private static String comparedColumn[] = new String[3];      ArrayList<String> houselist= new ArrayList<String>();      private static Text keyv = new Text();            private static Text valuev = new Text();      static Logger logger = Logger.getLogger(HouseMergeAndSplit.class.getName());             public void reduce(TextPair key, Iterable<Text> values,                           Context context                          ) throws IOException, InterruptedException {                houselist.clear();        String thedate=key.getFirst().toString();        String cookie=key.getSecond().toString();                    for (int i=0;i<3;i++)         comparedColumn[i]="";                //first+second为分组键,每次不同重新调用reduce函数        for (Text val:values)        {                if (thedate.equals(comparedColumn[0]) && cookie.equals(comparedColumn[1])&&  !key.getThree().toString().equals(comparedColumn[2]))          {          // context.write(new Text(key.getFirst()+" "+key.getSecond().toString()), new Text(key.getThree().toString()+" first"+ " "+comparedColumn[0]+" "+comparedColumn[1]+" "+comparedColumn[2]));           houselist.add(key.getThree().toString());                      comparedColumn[0]=key.getFirst().toString();             comparedColumn[1]=key.getSecond().toString();             comparedColumn[2]=key.getThree().toString();                      }                        if (!thedate.equals(comparedColumn[0])||!cookie.equals(comparedColumn[1]))                       {                      //  context.write(new Text(key.getFirst()+" "+key.getSecond().toString()), new Text(key.getThree().toString()+" second"+ " "+comparedColumn[0]+" "+comparedColumn[1]+" "+comparedColumn[2]));             
houselist.add(key.getThree().toString());             comparedColumn[0]=key.getFirst().toString();             comparedColumn[1]=key.getSecond().toString();             comparedColumn[2]=key.getThree().toString();                          }                                      }                    keyv.set(comparedColumn[0]); //日期        //valuev.set(houselist.toString());        //logger.info(houselist.toString());        //context.write(keyv,valuev);                        for (int i=0;i<houselist.size()-1;i++)        {         for (int j=i+1;j<houselist.size();j++)         {    valuev.set(houselist.get(i)+" "+houselist.get(j)); //关联的楼盘          context.write(keyv,valuev);         }        }                }     }       public static void main(String[] args) throws Exception {       Configuration conf = new Configuration();       String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();       if (otherArgs.length != 2) {         System.err.println("Usage: wordcount <in> <out>");         System.exit(2);       }              FileSystem fstm = FileSystem.get(conf);          Path outDir = new Path(otherArgs[1]);          fstm.delete(outDir, true);            conf.set("mapred.textoutputformat.separator", "\t"); //reduce输出时key value中间的分隔符       Job job = new Job(conf, "HouseMergeAndSplit");       job.setNumReduceTasks(4);       job.setJarByClass(HouseMergeAndSplit.class);       job.setMapperClass(TokenizerMapper.class);              job.setMapOutputKeyClass(TextPair.class);       job.setMapOutputValueClass(Text.class);       // 设置partition       job.setPartitionerClass(Partitioner1.class);       // 在分区之后按照指定的条件分组       job.setGroupingComparatorClass(Comp1.class);       // 设置reduce       // 设置reduce的输出       job.setReducerClass(IntSumReducer.class);       job.setOutputKeyClass(Text.class);       job.setOutputValueClass(Text.class);       //job.setNumReduceTasks(18);       FileInputFormat.addInputPath(job, new Path(otherArgs[0]));       
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));       System.exit(job.waitForCompletion(true) ? 0 : 1);     }  }
TextPair
import java.io.DataInput;  import java.io.DataOutput;  import java.io.IOException;    import org.apache.hadoop.io.Text;  import org.apache.hadoop.io.WritableComparable;    public class TextPair implements WritableComparable<TextPair> {   private Text first;   private Text second;   private Text three;   public TextPair() {     set(new Text(), new Text(),new Text());   }   public TextPair(String first, String second,String three) {     set(new Text(first), new Text(second),new Text(three));   }   public TextPair(Text first, Text second,Text Three) {     set(first, second,three);   }   public void set(Text first, Text second,Text three) {     this.first = first;     this.second = second;     this.three=three;   }   public Text getFirst() {     return first;   }   public Text getSecond() {     return second;   }   public Text getThree() {      return three;    }   public void write(DataOutput out) throws IOException {     first.write(out);     second.write(out);     three.write(out);   }   public void readFields(DataInput in) throws IOException {     first.readFields(in);     second.readFields(in);     three.readFields(in);   }   public int compareTo(TextPair tp) {     int cmp = first.compareTo(tp.first);     if (cmp != 0) {      return cmp;     }     cmp= second.compareTo(tp.second);     if (cmp != 0) {       return cmp;      }     return three.compareTo(tp.three);   }   }
TextPairSecond
import java.io.DataInput;  import java.io.DataOutput;  import java.io.IOException;    import org.apache.hadoop.io.FloatWritable;  import org.apache.hadoop.io.Text;  import org.apache.hadoop.io.WritableComparable;    public class TextPairSecond implements WritableComparable<TextPairSecond> {   private Text first;   private FloatWritable second;   public TextPairSecond() {     set(new Text(), new FloatWritable());   }   public TextPairSecond(String first, float second) {     set(new Text(first), new FloatWritable(second));   }   public TextPairSecond(Text first, FloatWritable second) {     set(first, second);   }   public void set(Text first, FloatWritable second) {     this.first = first;     this.second = second;   }   public Text getFirst() {     return first;   }   public FloatWritable getSecond() {     return second;   }   public void write(DataOutput out) throws IOException {     first.write(out);     second.write(out);   }   public void readFields(DataInput in) throws IOException {     first.readFields(in);     second.readFields(in);   }   public int compareTo(TextPairSecond tp) {     int cmp = first.compareTo(tp.first);     if (cmp != 0) {      return cmp;     }     return second.compareTo(tp.second);   }     }

第二个MR

import java.io.IOException;  import java.text.SimpleDateFormat;  import java.util.ArrayList;  import java.util.Date;    import org.apache.hadoop.conf.Configuration;  import org.apache.hadoop.fs.FileSystem;  import org.apache.hadoop.fs.Path;  import org.apache.hadoop.io.IntWritable;  import org.apache.hadoop.io.LongWritable;  import org.apache.hadoop.io.NullWritable;  import org.apache.hadoop.io.Text;  import org.apache.hadoop.io.WritableComparable;  import org.apache.hadoop.io.WritableComparator;  import org.apache.hadoop.mapred.OutputCollector;  import org.apache.hadoop.mapreduce.Job;  import org.apache.hadoop.mapreduce.Mapper;  import org.apache.hadoop.mapreduce.Partitioner;  import org.apache.hadoop.mapreduce.Reducer;    import org.apache.hadoop.mapreduce.Mapper.Context;  import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;  import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;  import org.apache.hadoop.util.GenericOptionsParser;  import org.apache.log4j.Logger;      /*   *  统计楼盘之间共同出现的次数   * 输入:   * 日期 楼盘1 楼盘2   *    * 输出:   * 日期 楼盘1 楼盘2 共同出现的次数   *    */    public class HouseCount {          public static class TokenizerMapper           extends Mapper<LongWritable, Text, Text, IntWritable>{                IntWritable iw=new IntWritable(1);       public void map(LongWritable key, Text value, Context context                       ) throws IOException, InterruptedException {                       context.write(value, iw);       }     }          public static class IntSumReducer           extends Reducer<Text,IntWritable,Text,IntWritable> {        IntWritable result=new IntWritable();       public void reduce(Text key, Iterable<IntWritable> values,                           Context context                          ) throws IOException, InterruptedException {                 int sum=0;         for (IntWritable iw:values)         {          sum+=iw.get();         }         result.set(sum);        context.write(key, result) ;               }     }  
     public static void main(String[] args) throws Exception {       Configuration conf = new Configuration();       String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();       if (otherArgs.length != 2) {         System.err.println("Usage: wordcount <in> <out>");         System.exit(2);       }              FileSystem fstm = FileSystem.get(conf);          Path outDir = new Path(otherArgs[1]);          fstm.delete(outDir, true);            conf.set("mapred.textoutputformat.separator", "\t"); //reduce输出时key value中间的分隔符       Job job = new Job(conf, "HouseCount");       job.setNumReduceTasks(2);       job.setJarByClass(HouseCount.class);       job.setMapperClass(TokenizerMapper.class);              job.setMapOutputKeyClass(Text.class);       job.setMapOutputValueClass(IntWritable.class);          // 设置reduce       // 设置reduce的输出       job.setReducerClass(IntSumReducer.class);       job.setOutputKeyClass(Text.class);       job.setOutputValueClass(IntWritable.class);       //job.setNumReduceTasks(18);       FileInputFormat.addInputPath(job, new Path(otherArgs[0]));       FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));       System.exit(job.waitForCompletion(true) ? 0 : 1);     }  }
第三个MR
import java.io.IOException;  import java.text.ParseException;  import java.text.SimpleDateFormat;  import java.util.ArrayList;  import java.util.Calendar;  import java.util.Date;    import org.apache.hadoop.conf.Configuration;  import org.apache.hadoop.fs.FileSystem;  import org.apache.hadoop.fs.Path;  import org.apache.hadoop.io.FloatWritable;  import org.apache.hadoop.io.IntWritable;  import org.apache.hadoop.io.LongWritable;  import org.apache.hadoop.io.NullWritable;  import org.apache.hadoop.io.Text;  import org.apache.hadoop.io.WritableComparable;  import org.apache.hadoop.io.WritableComparator;  import org.apache.hadoop.mapred.OutputCollector;  import org.apache.hadoop.mapreduce.Job;  import org.apache.hadoop.mapreduce.Mapper;  import org.apache.hadoop.mapreduce.Partitioner;  import org.apache.hadoop.mapreduce.Reducer;    import org.apache.hadoop.mapreduce.Mapper.Context;  import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;  import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;  import org.apache.hadoop.util.GenericOptionsParser;  import org.apache.log4j.Logger;      /*   * 汇总近三个月统计楼盘之间共同出现的次数,考虑衰减系数, 并最后a b 转成 b a输出一次   * 输入:   * 日期  楼盘1 楼盘2 共同出现的次数   *    * 输出   * 楼盘1 楼盘2 共同出现的次数(考虑了衰减系数,每天的衰减系数不一样)   *    */    public class HouseCountHz {          public static class HouseCountHzMapper           extends Mapper<LongWritable, Text, Text, FloatWritable>{          Text keyv=new Text();      FloatWritable valuev=new FloatWritable();       public void map(LongWritable key, Text value, Context context                       ) throws IOException, InterruptedException {               String[] s=value.toString().split("\t");       keyv.set(s[1]+" "+s[2]);//楼盘1,楼盘2       Calendar date1=Calendar.getInstance();      Calendar d2=Calendar.getInstance();         Date b = null;      SimpleDateFormat sdf=new SimpleDateFormat("yyyy-MM-dd");      try {        b=sdf.parse(s[0]);      } catch (ParseException e) {       e.printStackTrace();      }      
d2.setTime(b);      long n=date1.getTimeInMillis();      long birth=d2.getTimeInMillis();      long sss=n-birth;      int day=(int)((sss)/(3600*24*1000)); //该条记录的日期与当前日期的日期差      float factor=1/(1+(float)(day-1)/10); //衰减系数       valuev.set(Float.parseFloat(s[3])*factor);               context.write(keyv, valuev);       }     }          public static class HouseCountHzReducer           extends Reducer<Text,FloatWritable,Text,FloatWritable> {         FloatWritable result=new FloatWritable();      Text keyreverse=new Text();       public void reduce(Text key, Iterable<FloatWritable> values,                           Context context                          ) throws IOException, InterruptedException {                 float sum=0;         for (FloatWritable iw:values)         {          sum+=iw.get();         }         result.set(sum);         String[] keys=key.toString().split("\t");         keyreverse.set(keys[1]+" "+keys[0]);        context.write(key, result) ;        context.write(keyreverse, result) ;               }     }       public static void main(String[] args) throws Exception {       Configuration conf = new Configuration();       String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();       if (otherArgs.length != 2) {         System.err.println("Usage: wordcount <in> <out>");         System.exit(2);       }              FileSystem fstm = FileSystem.get(conf);          Path outDir = new Path(otherArgs[1]);          fstm.delete(outDir, true);            conf.set("mapred.textoutputformat.separator", "\t"); //reduce输出时key value中间的分隔符       Job job = new Job(conf, "HouseCountHz");       job.setNumReduceTasks(2);       job.setJarByClass(HouseCountHz.class);       job.setMapperClass(HouseCountHzMapper.class);              job.setMapOutputKeyClass(Text.class);       job.setMapOutputValueClass(FloatWritable.class);          // 设置reduce       // 设置reduce的输出       job.setReducerClass(HouseCountHzReducer.class);       
job.setOutputKeyClass(Text.class);       job.setOutputValueClass(FloatWritable.class);       //job.setNumReduceTasks(18);       FileInputFormat.addInputPath(job, new Path(otherArgs[0]));       FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));       System.exit(job.waitForCompletion(true) ? 0 : 1);     }  }
第四个MR
import java.io.IOException;  import java.util.Iterator;      import org.apache.hadoop.conf.Configuration;  import org.apache.hadoop.fs.FileSystem;  import org.apache.hadoop.fs.Path;  import org.apache.hadoop.io.FloatWritable;    import org.apache.hadoop.io.LongWritable;    import org.apache.hadoop.io.Text;  import org.apache.hadoop.io.WritableComparable;  import org.apache.hadoop.io.WritableComparator;    import org.apache.hadoop.mapreduce.Job;  import org.apache.hadoop.mapreduce.Mapper;  import org.apache.hadoop.mapreduce.Partitioner;  import org.apache.hadoop.mapreduce.Reducer;    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;  import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;  import org.apache.hadoop.util.GenericOptionsParser;        /*   * 输入数据:   * 楼盘1 楼盘2 共同出现的次数   *    * 输出数据   *  楼盘1 楼盘2,楼盘3,楼盘4 (按次数排序)   */    public class HouseRowToCol {      public static class Partitioner1 extends Partitioner<TextPairSecond, Text> {      @Override      //分区      public int getPartition(TextPairSecond key, Text value, int numParititon) {          return Math.abs((new Text(key.getFirst().toString()+key.getSecond().toString())).hashCode() * 127) % numParititon;        }   }   //分组      public static class Comp1 extends WritableComparator {       public Comp1() {        super(TextPairSecond.class, true);       }       @SuppressWarnings("unchecked")       public int compare(WritableComparable a, WritableComparable b) {        TextPairSecond t1 = (TextPairSecond) a;        TextPairSecond t2 = (TextPairSecond) b;         return t1.getFirst().compareTo(t2.getFirst());         }     }            //排序      public static class KeyComp extends WritableComparator {       public KeyComp() {        super(TextPairSecond.class, true);       }       @SuppressWarnings("unchecked")       public int compare(WritableComparable a, WritableComparable b) {        TextPairSecond t1 = (TextPairSecond) a;        TextPairSecond t2 = (TextPairSecond) b;        int comp= 
t1.getFirst().compareTo(t2.getFirst());        if (comp!=0)         return comp;        return -t1.getSecond().compareTo(t2.getSecond());       }     }      public static class HouseRowToColMapper           extends Mapper<LongWritable, Text, TextPairSecond, Text>{        Text houseid1=new Text();      Text houseid2=new Text();      FloatWritable weight=new FloatWritable();       public void map(LongWritable key, Text value, Context context                       ) throws IOException, InterruptedException {                String s[]=value.toString().split("\t");              weight.set(Float.parseFloat(s[2]));          houseid1.set(s[0]);          houseid2.set(s[1]);        TextPairSecond tp=new TextPairSecond(houseid1,weight);         context.write(tp, houseid2);       }     }          public static class HouseRowToColReducer           extends Reducer<TextPairSecond,Text,Text,Text> {            Text valuev=new Text();       public void reduce(TextPairSecond key, Iterable<Text> values,                           Context context                          ) throws IOException, InterruptedException {        Text keyv=key.getFirst();        Iterator<Text> it=values.iterator();        StringBuilder sb=new StringBuilder(it.next().toString());        while(it.hasNext())        {         sb.append(","+it.next().toString());        }        valuev.set(sb.toString());        context.write(keyv, valuev);                               }     }       public static void main(String[] args) throws Exception {       Configuration conf = new Configuration();       String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();       if (otherArgs.length != 2) {         System.err.println("Usage: wordcount <in> <out>");         System.exit(2);       }              FileSystem fstm = FileSystem.get(conf);          Path outDir = new Path(otherArgs[1]);          fstm.delete(outDir, true);            conf.set("mapred.textoutputformat.separator", "\t"); //reduce输出时key value中间的分隔符 
      Job job = new Job(conf, "HouseRowToCol");       job.setNumReduceTasks(4);       job.setJarByClass(HouseRowToCol.class);       job.setMapperClass(HouseRowToColMapper.class);              job.setMapOutputKeyClass(TextPairSecond.class);       job.setMapOutputValueClass(Text.class);       // 设置partition       job.setPartitionerClass(Partitioner1.class);       // 在分区之后按照指定的条件分组       job.setGroupingComparatorClass(Comp1.class);       job.setSortComparatorClass(KeyComp.class);       // 设置reduce       // 设置reduce的输出       job.setReducerClass(HouseRowToColReducer.class);       job.setOutputKeyClass(Text.class);       job.setOutputValueClass(Text.class);       //job.setNumReduceTasks(18);       FileInputFormat.addInputPath(job, new Path(otherArgs[0]));       FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));       System.exit(job.waitForCompletion(true) ? 0 : 1);     }  }
来自:http://blog.csdn.net/u011750989/article/details/12004065