HBase Programming with MapReduce

openkk · 12 years ago

A quick first exercise: saving MapReduce output into HBase, the large-scale distributed database. The example computes the page-view (PV) count for each URL. Because the input is in RCFile format, the job needs the hive-exec jar, and the HBase jar must be loaded as well; if the cluster administrators have already placed both jars under hadoop/lib on every node, that step can be skipped. Without further ado, here is the code:

package test.hbase;

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import com.sohu.tv.dm.common.RCFileInputFormat;

public class URLCountHbase {

    /** Emits (url, 1) per record; column 4 of the RCFile row holds the URL. */
    public static class HBaseMap extends
            Mapper<LongWritable, BytesRefArrayWritable, Text, IntWritable> {

        private final IntWritable one = new IntWritable(1);

        @Override
        protected void map(LongWritable key, BytesRefArrayWritable value,
                Context context) throws IOException, InterruptedException {
            byte[] url = value.get(4).getBytesCopy();
            context.write(new Text(url), one);
        }
    }

    /** Sums the counts and writes one Put per URL: row key = URL, column type:count = PV. */
    public static class HBaseReduce extends
            TableReducer<Text, IntWritable, NullWritable> {

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values,
                Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable i : values) {
                sum += i.get();
            }
            Put put = new Put(Bytes.toBytes(key.toString()));
            // The count is stored as a decimal string, not a binary int.
            put.add(Bytes.toBytes("type"), Bytes.toBytes("count"),
                    Bytes.toBytes(String.valueOf(sum)));
            context.write(NullWritable.get(), put);
        }
    }

    /** (Re)creates the output table with a single column family "type". */
    public static void createHbaseTable(String tablename) throws IOException {
        HTableDescriptor htd = new HTableDescriptor(tablename);
        HColumnDescriptor col = new HColumnDescriptor("type");
        htd.addFamily(col);
        Configuration config = HBaseConfiguration.create();
        HBaseAdmin admin = new HBaseAdmin(config);
        if (admin.tableExists(tablename)) {
            System.out.println("table exists, trying to recreate table");
            admin.disableTable(tablename);
            admin.deleteTable(tablename);
        }
        System.out.println("create new table: " + tablename);
        admin.createTable(htd);
    }

    public static void main(String[] args) throws Exception {
        String tablename = "urlcount";
        Configuration conf = new Configuration();

        // Ship the hive-exec and HBase jars with the job (via tmpjars),
        // in case they are not in hadoop/lib on the worker nodes.
        final FileSystem fs = FileSystem.getLocal(conf);
        final Set<String> localfiles = new HashSet<String>();
        localfiles.add("/opt/hadoop/hive-0.8.1/lib/hive-exec-0.8.1.jar");
        localfiles.add("/opt/hadoop/hbase/hbase-0.92.1.jar");
        final Set<String> files = new HashSet<String>();
        for (String s : localfiles) {
            files.add(convertPath(s, fs));
        }
        cacheJars(conf, files);

        conf.set(TableOutputFormat.OUTPUT_TABLE, tablename);
        createHbaseTable(tablename);

        Job job = new Job(conf, "URLCount table with " + args[0]);
        job.setJarByClass(URLCountHbase.class);
        job.setNumReduceTasks(3);
        job.setReducerClass(HBaseReduce.class);
        job.setMapperClass(HBaseMap.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputFormatClass(TableOutputFormat.class);
        job.setInputFormatClass(RCFileInputFormat.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /** Qualifies a local path (file://...) so it can be placed on tmpjars. */
    private static String convertPath(String path, FileSystem fs) {
        final Path p = new Path(path);
        return p.makeQualified(fs).toString();
    }

    /** Appends the given jar URLs to the job's "tmpjars" (distributed cache). */
    private static void cacheJars(Configuration job, Set<String> localUrls)
            throws IOException {
        if (localUrls.isEmpty()) {
            return;
        }
        final String tmpjars = job.get("tmpjars");
        final StringBuilder sb = new StringBuilder();
        if (null != tmpjars) {
            sb.append(tmpjars);
            sb.append(",");
        }
        sb.append(org.apache.hadoop.util.StringUtils
                .arrayToString(localUrls.toArray(new String[0])));
        job.set("tmpjars", sb.toString());
    }
}
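
As an aside, HBase also ships a helper that does most of this wiring. A minimal sketch, assuming the HBase 0.92 TableMapReduceUtil API: initTableReducerJob binds HBaseReduce to the output table and configures TableOutputFormat, while addDependencyJars ships the jars that contain the listed classes through tmpjars, which would stand in for the hand-rolled convertPath/cacheJars plumbing above:

// Alternative wiring inside main(), using
// org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil:

// Sets TableOutputFormat, the OUTPUT_TABLE property and the reducer class.
TableMapReduceUtil.initTableReducerJob(tablename, HBaseReduce.class, job);
// Ships the jars containing these classes (the HBase jar and the
// hive-exec jar) to the cluster via tmpjars / the distributed cache.
TableMapReduceUtil.addDependencyJars(job.getConfiguration(),
        org.apache.hadoop.hbase.client.Put.class,
        org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable.class);

Resolving jars from classes rather than hard-coding /opt/hadoop/... paths also makes the job independent of where the jars happen to live on the submitting machine.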
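
Once the job has run, the counts can be spot-checked from a plain Java client. A minimal read-back sketch, again against the 0.92-era API; the row key "http://example.com/" is a placeholder for any URL that actually occurs in the input, and since the reducer stored the count as a decimal string, Bytes.toString recovers it:

package test.hbase;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.util.Bytes;

public class URLCountCheck {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        HTable table = new HTable(conf, "urlcount");
        // The row key is the URL itself; substitute one present in your data.
        Get get = new Get(Bytes.toBytes("http://example.com/"));
        Result result = table.get(get);
        byte[] raw = result.getValue(Bytes.toBytes("type"), Bytes.toBytes("count"));
        System.out.println(raw == null ? "no such row" : Bytes.toString(raw));
        table.close();
    }
}

Had the reducer written Bytes.toBytes(sum) instead of the string form, the value would be a 4-byte integer readable with Bytes.toInt.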