ITRI HBase 進階課程
HBase 範例
範例六:WordCountHBase
說明:
- 此程式碼將輸入路徑的檔案內的字串取出做字數統計,再將結果塞回HTable內
- 請注意在將hbase 等函式庫放入hadoop 的lib 目錄後,必須重新啟動hbase 與 hadoop 再執行此範例程式才不會出現錯誤
$ bin/hadoop dfs -mkdir input $ bin/hadoop dfs -put README.txt input $ bin/hadoop jar ItriMenu.jar CountToHBaseReducer input
注意:
- 在hdfs 上來源檔案的路徑為 "/user/$YOUR_NAME/input"
請注意必須先放資料到此hdfs上的資料夾內,且此資料夾內只能放檔案,不可再放資料夾
- 運算完後,程式將執行結果放在hbase的wordcount資料表內
程式碼
package itri; import java.io.IOException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.client.Put; import org.apache.hadoop.hbase.mapreduce.TableOutputFormat; import org.apache.hadoop.hbase.mapreduce.TableReducer; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; import org.apache.hadoop.util.GenericOptionsParser; public class CountToHBaseReducer { public static class HtMap extends Mapper<LongWritable, Text, Text, IntWritable> { private IntWritable one = new IntWritable(1); public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { // 輸入的字串先轉換成小寫再用空白區隔 String s[] = value.toString().toLowerCase().trim().split(" "); for (String m : s) { // 寫入到輸出串流 context.write(new Text(m), one); } } } // TableReducer<KEYIN,VALUEIN,KEYOUT> // 原本為 TableReducer<Text, IntWritable, NullWritable > // 但在此改成 LongWritable 也可以 // 因此證明在此的Class可以很多,org.apache.hadoop.io.* 內有write()的Writable class應該皆可 public static class HtReduce extends TableReducer<Text, IntWritable, LongWritable> { public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException { int sum = 0; for (IntWritable i : values) { sum += i.get(); } // org.apache.hadoop.hbase.client.Put // Used to perform Put operations for a single row. // new Put(byte[] row) Put put = new Put(Bytes.toBytes(key.toString())); // add(byte[] family, byte[] qualifier, byte[] value) // 在main設定output format class 為 TableOutputFormat // TableReducer 內有定義 output Key class 必須為 Put 或 Delete put.add(Bytes.toBytes("content"), Bytes.toBytes("word"), Bytes .toBytes(key.toString())); put.add(Bytes.toBytes("content"), Bytes.toBytes("count"), Bytes .toBytes(String.valueOf(sum))); // NullWritable.get(): Returns the single instance of this class. // NullWritable.write(): Serialize the fields of this object to out. context.write(new LongWritable(), put); // context.write(NullWritable.get(), put) } } public static void main(String argv[]) throws Exception { // String[] argc = {"input"}; argv = argc ; // in hdfs Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, argv) .getRemainingArgs(); if (otherArgs.length < 1) { System.out.println("CountToHBaseReducer <inHdfsDir> "); return; } String input = otherArgs[0]; String tablename = "wordcounthbase"; // OUTPUT_TABLE = "hbase.mapred.outputtable" // conf.set 用於設定 如 core-site.xml 的 name 與 value // 告訴程式 hbase.mapred.outputtable --> wordcount conf.set(TableOutputFormat.OUTPUT_TABLE, tablename); // 建立hbase 的table 否則沒先建立會出錯 CreateTable.createHBaseTable(tablename, "content"); Job job = new Job(conf, "WordCount table with " + input); job.setJarByClass(CountToHBaseReducer.class); job.setMapperClass(HtMap.class); job.setReducerClass(HtReduce.class); // 此範例的輸出為 <Text,IntWritable> 因此其實可以省略宣告 // set{Map|Reduce}Output{Key|Value}Class() job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); // InputFormat 只有一個子介面 // FileInputFormat <-(SequenceFileInputFormat,TextInputFormat) // 其中TextInputFormat 最常用 ,預設輸入為 LongWritable,Text // 另外HBase 則設計了一個子類別 TableInputFormat job.setInputFormatClass(TextInputFormat.class); // TAbleOutputFormat // 宣告此行則可使 reduce 輸出為 HBase 的table job.setOutputFormatClass(TableOutputFormat.class); // 原本設定輸入檔案為 Config.setInputPath(Path) 卻改為 // FileInputFormat.addInputPath(Job, Path()) 的設計, // 猜測應該是考慮某些檔案操作並不需要跑mapreduce的Job,因此提到外面 FileInputFormat.addInputPath(job, new Path(input)); System.exit(job.waitForCompletion(true) ? 0 : 1); } }
執行測試
$ /opt/hbase/bin/hbase shell hbase(main):x:0> list wordcount 1 row(s) in 0.0240 seconds hbase(main):x:0> scan 'wordcount' ..... zeller column=content:count, timestamp=1285674576293, value=1 zero column=content:count, timestamp=1285674576293, value=8 zero, column=content:count, timestamp=1285674576293, value=2 zero-compressed column=content:count, timestamp=1285674576293, value=1 ..... hbase(main):x:0> exit
Last modified 14 years ago
Last modified on Apr 25, 2011, 3:55:44 PM