wiki:waue/2010/0204-06

範例六:WordCountHBase

說明:

此程式從輸入路徑的檔案中讀出字串並做字數統計,再將統計結果寫入 HBase 的資料表 (HTable) 內

執行方法

測試用打包檔 tsmcHBase_100204.jar

$ /opt/hadoop/bin/hadoop jar tsmcHBase_100204.jar CountToHBaseReducer

結果:

$ hbase shell
	> scan 'wordcount'
	ROW	COLUMN+CELL
	am	column=content:count, timestamp=1264406245488, value=1
	chen	column=content:count, timestamp=1264406245488, value=1
	hi,	column=content:count, timestamp=1264406245488, value=2
	......(略)

注意:

  1. 在 hdfs 上來源檔案的路徑為 "/user/$YOUR_NAME/input"(程式碼中的預設值寫為 "/user/waue/input",請依自己的帳號調整)

請注意必須先放資料到此hdfs上的資料夾內,且此資料夾內只能放檔案,不可再放資料夾

  2. 運算完後,程式將執行結果放在 hbase 的 wordcount 資料表內

參考:

1.程式碼改編於: http://blog.ring.idv.tw/comment.ser?i=337

2.hbase 運作 mapreduce 程式的方法參考於:http://wiki.apache.org/hadoop/Hbase/MapReduce

程式碼

package tsmc;

import java.io.IOException;
import java.util.Locale;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class CountToHBaseReducer {
  public static class HtMap extends
      Mapper<LongWritable, Text, Text, IntWritable> {
    private IntWritable one = new IntWritable(1);

    public void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      // 輸入的字串先轉換成小寫再用空白區隔
      String s[] = value.toString().toLowerCase().trim().split(" ");
      for (String m : s) {
        // 寫入到輸出串流
        context.write(new Text(m), one);
      }
    }
  }

  // TableReducer<KEYIN,VALUEIN,KEYOUT>
  // 原本為 TableReducer<Text, IntWritable, NullWritable >
  // 但在此改成 LongWritable 也可以
  // 因此證明在此的Class可以很多,org.apache.hadoop.io.* 內有write()的Writable class應該皆可
  public static class HtReduce extends
      TableReducer<Text, IntWritable, LongWritable> {

    public void reduce(Text key, Iterable<IntWritable> values,
        Context context) throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable i : values) {
        sum += i.get();
      }

      // org.apache.hadoop.hbase.client.Put
      // Used to perform Put operations for a single row.
      // new Put(byte[] row)
      Put put = new Put(Bytes.toBytes(key.toString()));

      // add(byte[] family, byte[] qualifier, byte[] value)
      // 在main設定output format class 為 TableOutputFormat
      // TableReducer 內有定義 output Key class 必須為 Put 或 Delete
      put.add(Bytes.toBytes("content"), Bytes.toBytes("count"), Bytes
          .toBytes(String.valueOf(sum)));

      // NullWritable.get(): Returns the single instance of this class.
      // NullWritable.write(): Serialize the fields of this object to out.
      context.write(new LongWritable(), put);
      // context.write(NullWritable.get(), put)
    }
  }

  public static void main(String args[]) throws Exception {
    // debug
    String[] argv = { "/user/waue/input" };
    args = argv;
    String input = args[0];

    String tablename = "wordcount";
    String family = "content";

    Configuration conf = new Configuration();
    // OUTPUT_TABLE = "hbase.mapred.outputtable"
    // conf.set 用於設定 如 core-site.xml 的 name 與 value
    // 告訴程式 hbase.mapred.outputtable --> wordcount
    conf.set(TableOutputFormat.OUTPUT_TABLE, tablename);
    // 建立hbase 的table 否則沒先建立會出錯
    CreateTable.createHBaseTable(tablename, family);

    Job job = new Job(conf, "WordCount table with " + input);

    job.setJarByClass(CountToHBaseReducer.class);

    job.setMapperClass(HtMap.class);
    job.setReducerClass(HtReduce.class);
    // 此範例的輸出為 <Text,IntWritable> 因此其實可以省略宣告
    // set{Map|Reduce}Output{Key|Value}Class()
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    // InputFormat 只有一個子介面
    // FileInputFormat <-(SequenceFileInputFormat,TextInputFormat)
    // 其中TextInputFormat 最常用 ,預設輸入為 LongWritable,Text
    // 另外HBase 則設計了一個子類別 TableInputFormat
    job.setInputFormatClass(TextInputFormat.class);
    // TAbleOutputFormat
    // 宣告此行則可使 reduce 輸出為 HBase 的table
    job.setOutputFormatClass(TableOutputFormat.class);

    // 原本設定輸入檔案為 Config.setInputPath(Path) 卻改為
    // FileInputFormat.addInputPath(Job, Path()) 的設計,
    // 猜測應該是考慮某些檔案操作並不需要跑mapreduce的Job,因此提到外面
    FileInputFormat.addInputPath(job, new Path(input));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}
Last modified on Feb 5, 2010, 12:00:16 AM (14 years ago)