| | 1 | {{{ |
| | 2 | #!html |
| | 3 | <div style="text-align: center; color:#151B8D"><big style="font-weight: bold;"><big><big> |
| | 4 | ITRI HBase 進階課程 |
| | 5 | </big></big></big></div> <div style="text-align: center; color:#7E2217"><big style="font-weight: bold;"><big> |
| | 6 | HBase 範例 |
| | 7 | </big></big></div> |
| | 8 | }}} |
| | 9 | |
| | 10 | [wiki:waue/2011/0426_4_5 上一關 < ] 第六關 [wiki:waue/2011/0426_4_7 > 下一關] |
| | 11 | |
| | 12 | = 範例六:WordCountHBase = |
| | 13 | |
| | 14 | == 說明: == |
| | 15 | |
| | 16 | * 此程式碼將輸入路徑的檔案內的字串取出做字數統計,再將結果塞回HTable內 |
| | 17 | * 請注意在將hbase 等函式庫放入hadoop 的lib 目錄後,必須重新啟動hbase 與 hadoop 再執行此範例程式才不會出現錯誤 |
| | 18 | |
| | 19 | {{{ |
| | 20 | $ bin/hadoop dfs -mkdir input |
| | 21 | $ bin/hadoop dfs -put README.txt input |
| | 22 | $ bin/hadoop jar ItriMenu.jar CountToHBaseReducer input |
| | 23 | }}} |
| | 24 | |
| | 25 | == 注意: == |
| | 26 | |
| | 27 | 1. 在hdfs 上來源檔案的路徑為 "/user/$YOUR_NAME/input" |
| | 28 | |
| | 29 | 請注意必須先放資料到此hdfs上的資料夾內,且此資料夾內只能放檔案,不可再放資料夾 |
| | 30 | |
| | 31 | 2. 運算完後,程式將執行結果放在hbase的wordcounthbase資料表內 |
| | 32 | |
| | 33 | |
| | 34 | |
| | 35 | == 程式碼 == |
| | 36 | |
| | 37 | {{{ |
| | 38 | #!java |
| | 39 | |
| | 40 | package itri; |
| | 41 | |
| | 42 | |
| | 43 | import java.io.IOException; |
| | 44 | |
| | 45 | import org.apache.hadoop.conf.Configuration; |
| | 46 | import org.apache.hadoop.fs.Path; |
| | 47 | import org.apache.hadoop.hbase.client.Put; |
| | 48 | import org.apache.hadoop.hbase.mapreduce.TableOutputFormat; |
| | 49 | import org.apache.hadoop.hbase.mapreduce.TableReducer; |
| | 50 | import org.apache.hadoop.hbase.util.Bytes; |
| | 51 | import org.apache.hadoop.io.IntWritable; |
| | 52 | import org.apache.hadoop.io.LongWritable; |
| | 53 | import org.apache.hadoop.io.Text; |
| | 54 | import org.apache.hadoop.mapreduce.Job; |
| | 55 | import org.apache.hadoop.mapreduce.Mapper; |
| | 56 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; |
| | 57 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; |
| | 58 | import org.apache.hadoop.util.GenericOptionsParser; |
| | 59 | |
| | 60 | public class CountToHBaseReducer { |
| | 61 | public static class HtMap extends |
| | 62 | Mapper<LongWritable, Text, Text, IntWritable> { |
| | 63 | private IntWritable one = new IntWritable(1); |
| | 64 | |
| | 65 | public void map(LongWritable key, Text value, Context context) |
| | 66 | throws IOException, InterruptedException { |
| | 67 | // 輸入的字串先轉換成小寫再用空白區隔 |
| | 68 | String s[] = value.toString().toLowerCase().trim().split(" "); |
| | 69 | for (String m : s) { |
| | 70 | // 寫入到輸出串流 |
| | 71 | context.write(new Text(m), one); |
| | 72 | } |
| | 73 | } |
| | 74 | } |
| | 75 | |
| | 76 | // TableReducer<KEYIN,VALUEIN,KEYOUT> |
| | 77 | // 原本為 TableReducer<Text, IntWritable, NullWritable > |
| | 78 | // 但在此改成 LongWritable 也可以 |
| | 79 | // 因此證明在此的Class可以很多,org.apache.hadoop.io.* 內有write()的Writable class應該皆可 |
| | 80 | public static class HtReduce extends |
| | 81 | TableReducer<Text, IntWritable, LongWritable> { |
| | 82 | |
| | 83 | public void reduce(Text key, Iterable<IntWritable> values, |
| | 84 | Context context) throws IOException, InterruptedException { |
| | 85 | int sum = 0; |
| | 86 | for (IntWritable i : values) { |
| | 87 | sum += i.get(); |
| | 88 | } |
| | 89 | |
| | 90 | // org.apache.hadoop.hbase.client.Put |
| | 91 | // Used to perform Put operations for a single row. |
| | 92 | // new Put(byte[] row) |
| | 93 | Put put = new Put(Bytes.toBytes(key.toString())); |
| | 94 | |
| | 95 | // add(byte[] family, byte[] qualifier, byte[] value) |
| | 96 | // 在main設定output format class 為 TableOutputFormat |
| | 97 | // TableReducer 內有定義 output Key class 必須為 Put 或 Delete |
| | 98 | put.add(Bytes.toBytes("content"), Bytes.toBytes("word"), Bytes |
| | 99 | .toBytes(key.toString())); |
| | 100 | put.add(Bytes.toBytes("content"), Bytes.toBytes("count"), Bytes |
| | 101 | .toBytes(String.valueOf(sum))); |
| | 102 | |
| | 103 | // NullWritable.get(): Returns the single instance of this class. |
| | 104 | // NullWritable.write(): Serialize the fields of this object to out. |
| | 105 | context.write(new LongWritable(), put); |
| | 106 | // context.write(NullWritable.get(), put) |
| | 107 | } |
| | 108 | } |
| | 109 | |
| | 110 | public static void main(String argv[]) throws Exception { |
| | 111 | // String[] argc = {"input"}; argv = argc ; // in hdfs |
| | 112 | |
| | 113 | Configuration conf = new Configuration(); |
| | 114 | String[] otherArgs = new GenericOptionsParser(conf, argv) |
| | 115 | .getRemainingArgs(); |
| | 116 | if (otherArgs.length < 1) { |
| | 117 | System.out.println("CountToHBaseReducer <inHdfsDir> "); |
| | 118 | return; |
| | 119 | } |
| | 120 | |
| | 121 | String input = otherArgs[0]; |
| | 122 | |
| | 123 | String tablename = "wordcounthbase"; |
| | 124 | |
| | 125 | |
| | 126 | // OUTPUT_TABLE = "hbase.mapred.outputtable" |
| | 127 | // conf.set 用於設定 如 core-site.xml 的 name 與 value |
| | 128 | // 告訴程式 hbase.mapred.outputtable --> wordcount |
| | 129 | conf.set(TableOutputFormat.OUTPUT_TABLE, tablename); |
| | 130 | // 建立hbase 的table 否則沒先建立會出錯 |
| | 131 | CreateTable.createHBaseTable(tablename, "content"); |
| | 132 | |
| | 133 | Job job = new Job(conf, "WordCount table with " + input); |
| | 134 | |
| | 135 | job.setJarByClass(CountToHBaseReducer.class); |
| | 136 | |
| | 137 | job.setMapperClass(HtMap.class); |
| | 138 | job.setReducerClass(HtReduce.class); |
| | 139 | // 此範例的輸出為 <Text,IntWritable> 因此其實可以省略宣告 |
| | 140 | // set{Map|Reduce}Output{Key|Value}Class() |
| | 141 | job.setMapOutputKeyClass(Text.class); |
| | 142 | job.setMapOutputValueClass(IntWritable.class); |
| | 143 | // InputFormat 只有一個子介面 |
| | 144 | // FileInputFormat <-(SequenceFileInputFormat,TextInputFormat) |
| | 145 | // 其中TextInputFormat 最常用 ,預設輸入為 LongWritable,Text |
| | 146 | // 另外HBase 則設計了一個子類別 TableInputFormat |
| | 147 | job.setInputFormatClass(TextInputFormat.class); |
| | 148 | // TAbleOutputFormat |
| | 149 | // 宣告此行則可使 reduce 輸出為 HBase 的table |
| | 150 | job.setOutputFormatClass(TableOutputFormat.class); |
| | 151 | |
| | 152 | // 原本設定輸入檔案為 Config.setInputPath(Path) 卻改為 |
| | 153 | // FileInputFormat.addInputPath(Job, Path()) 的設計, |
| | 154 | // 猜測應該是考慮某些檔案操作並不需要跑mapreduce的Job,因此提到外面 |
| | 155 | FileInputFormat.addInputPath(job, new Path(input)); |
| | 156 | |
| | 157 | System.exit(job.waitForCompletion(true) ? 0 : 1); |
| | 158 | } |
| | 159 | } |
| | 160 | }}} |
| | 161 | |
| | 162 | = 執行測試 = |
| | 163 | |
| | 164 | {{{ |
| | 165 | $ /opt/hbase/bin/hbase shell |
| | 166 | hbase(main):x:0> list |
| | 167 | wordcounthbase |
| | 168 | 1 row(s) in 0.0240 seconds |
| | 169 | hbase(main):x:0> scan 'wordcounthbase' |
| | 170 | ..... |
| | 171 | zeller column=content:count, timestamp=1285674576293, value=1 |
| | 172 | zero column=content:count, timestamp=1285674576293, value=8 |
| | 173 | zero, column=content:count, timestamp=1285674576293, value=2 |
| | 174 | zero-compressed column=content:count, timestamp=1285674576293, value=1 |
| | 175 | ..... |
| | 176 | hbase(main):x:0> exit |
| | 177 | }}} |