Context Navigation

← Previous Change
Wiki History
Next Change →

0125

Timestamp:: Jan 25, 2010, 4:29:39 PM (14 years ago)
Author:: waue
Comment:: --

Legend:

: Unmodified
: Added
: Removed
: Modified

waue/2010/0125

                       v1
+ = hbase 程式範例 =
+ = 範例一 =
+{{{
+#!java
+package hbase020;
+// WordCountHBase
+//說明：
+//      此程式碼將輸入路徑的檔案內的字串取出做字數統計
+//      再將結果塞回HTable內
+//
+//運算方法：
+//      將此程式運作在hadoop 0.20 平台上，用(參考2)的方法加入hbase參數後，將此程式碼打包成XX.jar
+//  執行：
+//      ---------------------------
+//      hadoop jar XX.jar WordCountHBase <hdfs_input>
+//      ---------------------------
+//
+//結果：
+//      ---------------------------
+//      $ hbase shell
+//      > scan 'wordcount'
+//      ROW             COLUMN+CELL
+//      am              column=content:count, timestamp=1264406245488, value=1
+//      chen            column=content:count, timestamp=1264406245488, value=1
+//      hi,             column=content:count, timestamp=1264406245488, value=2
+//      ......(略)
+//      ---------------------------
+//注意：
+//1.    在hdfs 上來源檔案的路徑為 "/user/$YOUR_NAME/input"
+//      請注意必須先放資料到此hdfs上的資料夾內，且此資料夾內只能放檔案，不可再放資料夾
+//2.    運算完後，程式將執行結果放在hbase的wordcount資料表內
+//
+//參考：
+//      1.程式碼改編於： http://blog.ring.idv.tw/comment.ser?i=337
+//      2.hbase 運作 mapreduce 程式的方法參考於：http://wiki.apache.org/hadoop/Hbase/MapReduce
+import java.io.IOException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.HBaseConfiguration;
+import org.apache.hadoop.hbase.HColumnDescriptor;
+import org.apache.hadoop.hbase.HTableDescriptor;
+import org.apache.hadoop.hbase.client.HBaseAdmin;
+import org.apache.hadoop.hbase.client.Put;
+import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
+import org.apache.hadoop.hbase.mapreduce.TableReducer;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
+public class WordCountHBase {
+        public static class HtMap extends
+                        Mapper<LongWritable, Text, Text, IntWritable> {
+                private IntWritable one = new IntWritable(1);
+                public void map(LongWritable key, Text value, Context context)
+                                throws IOException, InterruptedException {
+                        // 輸入的字串先轉換成小寫再用空白區隔
+                        String s[] = value.toString().toLowerCase().trim().split(" ");
+                        for (String m : s) {
+                                // 寫入到輸出串流
+                                context.write(new Text(m), one);
+                        }
+                }
+        }
+        // TableReducer<KEYIN,VALUEIN,KEYOUT>
+        // 原本為 TableReducer<Text, IntWritable, NullWritable >
+        // 但在此改成 LongWritable 也可以
+        // 因此證明在此的Class可以很多,org.apache.hadoop.io.* 內有write()的Writable class應該皆可
+        public static class HtReduce extends
+                        TableReducer<Text, IntWritable,LongWritable > {
+                public void reduce(Text key, Iterable<IntWritable> values,
+                                Context context) throws IOException, InterruptedException {
+                        int sum = 0;
+                        for (IntWritable i : values) {
+                                sum += i.get();
+                        }
+                        // org.apache.hadoop.hbase.client.Put
+                        // Used to perform Put operations for a single row.
+                        // new Put(byte[] row)
+                        Put put = new Put(Bytes.toBytes(key.toString()));
+                        // add(byte[] family, byte[] qualifier, byte[] value)
+                        // 在main設定output format class 為 TableOutputFormat
+                        // TableReducer 內有定義 output Key class 必須為 Put 或 Delete
+                        put.add(Bytes.toBytes("content"), Bytes.toBytes("count"), Bytes
+                                        .toBytes(String.valueOf(sum)));
+                        //NullWritable.get(): Returns the single instance of this class.
+                        //NullWritable.write(): Serialize the fields of this object to out.
+                        context.write(new LongWritable(), put);
+                        // context.write(NullWritable.get(), put)
+                }
+        }
+        public static void createHBaseTable(String tablename) throws IOException {
+                // HTableDescriptor contains the name of an HTable, and its column
+                // families
+                // HTableDescriptor 用來描述table的屬性
+                HTableDescriptor htd = new HTableDescriptor(tablename);
+                // HColumnDescriptor HColumnDescriptor contains information about a
+                // column family such as the number of versions, compression settings,
+                // etc.
+                // HTableDescriptor 透過 add() 方法來加入Column family
+                htd.addFamily(new HColumnDescriptor("content:"));
+                // HBaseConfiguration 能接收 hbase-site.xml 的設定值
+                HBaseConfiguration config = new HBaseConfiguration();
+                // 檔案的操作則使用 HBaseAdmin
+                HBaseAdmin admin = new HBaseAdmin(config);
+                // 檢查
+                if (admin.tableExists(tablename)) {
+                        // 停止
+                        admin.disableTable(tablename);
+                        // 刪除
+                        admin.deleteTable(tablename);
+                }
+                System.out.println("create new table: " + tablename);
+                // 建立
+                admin.createTable(htd);
+        }
+        public static void main(String args[]) throws Exception {
+                // debug
+                String[] argv = { "/user/waue/input" };
+                args = argv;
+                String input = args[0];
+                String tablename = "wordcount";
+                Configuration conf = new Configuration();
+                // OUTPUT_TABLE = "hbase.mapred.outputtable"
+                // conf.set 用於設定 如 core-site.xml 的 name 與 value
+                // 告訴程式 hbase.mapred.outputtable --> wordcount
+                conf.set(TableOutputFormat.OUTPUT_TABLE, tablename);
+                // 建立hbase 的table 否則沒先建立會出錯
+                createHBaseTable(tablename);
+                Job job = new Job(conf, "WordCount table with " + input);
+                job.setJarByClass(WordCountHBase.class);
+                job.setNumReduceTasks(1);
+                job.setMapperClass(HtMap.class);
+                job.setReducerClass(HtReduce.class);
+                // 此範例的輸出為 <Text,IntWritable> 因此其實可以省略宣告
+                // set{Map|Reduce}Output{Key|Value}Class()
+                job.setMapOutputKeyClass(Text.class);
+                job.setMapOutputValueClass(IntWritable.class);
+                // InputFormat 只有一個子介面
+                // FileInputFormat <-(SequenceFileInputFormat,TextInputFormat)
+                // 其中TextInputFormat 最常用 ，預設輸入為 LongWritable,Text
+                // 另外HBase 則設計了一個子類別 TableInputFormat
+                job.setInputFormatClass(TextInputFormat.class);
+                // TAbleOutputFormat
+                // 宣告此行則可使 reduce 輸出為 HBase 的table
+                job.setOutputFormatClass(TableOutputFormat.class);
+                // 原本設定輸入檔案為 Config.setInputPath(Path) 卻改為
+                // FileInputFormat.addInputPath(Job, Path()) 的設計，
+                // 猜測應該是考慮某些檔案操作並不需要跑mapreduce的Job，因此提到外面
+                FileInputFormat.addInputPath(job, new Path(input));
+                System.exit(job.waitForCompletion(true) ? 0 : 1);
+        }
+}
+}}}
+ = 範例二 =
+{{{
+#!java
+package hbase020;
+// from hbase website
+// http://hadoop.apache.org/hbase/docs/current/api/org/apache/hadoop/hbase/client/package-summary.html#package_description
+import java.io.IOException;
+import org.apache.hadoop.hbase.HBaseConfiguration;
+import org.apache.hadoop.hbase.HColumnDescriptor;
+import org.apache.hadoop.hbase.HTableDescriptor;
+import org.apache.hadoop.hbase.client.Get;
+import org.apache.hadoop.hbase.client.HBaseAdmin;
+import org.apache.hadoop.hbase.client.HTable;
+import org.apache.hadoop.hbase.client.Put;
+import org.apache.hadoop.hbase.client.Result;
+import org.apache.hadoop.hbase.client.ResultScanner;
+import org.apache.hadoop.hbase.client.Scan;
+import org.apache.hadoop.hbase.util.Bytes;
+// Class that has nothing but a main.
+// Does a Put, Get and a Scan against an hbase table.
+public class MyLittleHBaseClient {
+        public static void createHBaseTable(String tablename) throws IOException {
+                // HTableDescriptor contains the name of an HTable, and its column
+                // families
+                // HTableDescriptor 用來描述table的屬性
+                HTableDescriptor htd = new HTableDescriptor(tablename);
+                // HColumnDescriptor HColumnDescriptor contains information about a
+                // column family such as the number of versions, compression settings,
+                // etc.
+                // HTableDescriptor 透過 add() 方法來加入Column family
+                htd.addFamily(new HColumnDescriptor("content:"));
+                // HBaseConfiguration 能接收 hbase-site.xml 的設定值
+                HBaseConfiguration config = new HBaseConfiguration();
+                // 檔案的操作則使用 HBaseAdmin
+                HBaseAdmin admin = new HBaseAdmin(config);
+                // 檢查
+                if (admin.tableExists(tablename)) {
+                        // 停止
+                        admin.disableTable(tablename);
+                        // 刪除
+                        admin.deleteTable(tablename);
+                }
+                System.out.println("create new table: " + tablename);
+                // 建立
+                admin.createTable(htd);
+        }
+        public static void main(String[] args) throws IOException {
+                // You need a configuration object to tell the client where to connect.
+                // When you create a HBaseConfiguration, it reads in whatever you've set
+                // into your hbase-site.xml and in hbase-default.xml, as long as these
+                // can
+                // be found on the CLASSPATH
+                HBaseConfiguration config = new HBaseConfiguration();
+                // This instantiates an HTable object that connects you to
+                // the "myLittleHBaseTable" table.
+                HTable table = new HTable(config, "myLittleHBaseTable");
+                // To add to a row, use Put. A Put constructor takes the name of the row
+                // you want to insert into as a byte array. In HBase, the Bytes class
+                // has
+                // utility for converting all kinds of java types to byte arrays. In the
+                // below, we are converting the String "myLittleRow" into a byte array
+                // to
+                // use as a row key for our update. Once you have a Put instance, you
+                // can
+                // adorn it by setting the names of columns you want to update on the
+                // row,
+                // the timestamp to use in your update, etc.If no timestamp, the server
+                // applies current time to the edits.
+                Put p = new Put(Bytes.toBytes("myLittleRow"));
+                // To set the value you'd like to update in the row 'myRow', specify the
+                // column family, column qualifier, and value of the table cell you'd
+                // like
+                // to update. The column family must already exist in your table schema.
+                // The qualifier can be anything. All must be specified as byte arrays
+                // as
+                // hbase is all about byte arrays. Lets pretend the table
+                // 'myLittleHBaseTable' was created with a family 'myLittleFamily'.
+                p.add(Bytes.toBytes("myLittleFamily"), Bytes.toBytes("someQualifier"),
+                                Bytes.toBytes("Some Value"));
+                // Once you've adorned your Put instance with all the updates you want
+                // to
+                // make, to commit it do the following (The HTable#put method takes the
+                // Put instance you've been building and pushes the changes you made
+                // into
+                // hbase)
+                table.put(p);
+                // Now, to retrieve the data we just wrote. The values that come back
+                // are
+                // Result instances. Generally, a Result is an object that will package
+                // up
+                // the hbase return into the form you find most palatable.
+                Get g = new Get(Bytes.toBytes("myLittleRow"));
+                Result r = table.get(g);
+                byte[] value = r.getValue(Bytes.toBytes("myLittleFamily"), Bytes
+                                .toBytes("someQualifier"));
+                // If we convert the value bytes, we should get back 'Some Value', the
+                // value we inserted at this location.
+                String valueStr = Bytes.toString(value);
+                System.out.println("GET: " + valueStr);
+                // Sometimes, you won't know the row you're looking for. In this case,
+                // you
+                // use a Scanner. This will give you cursor-like interface to the
+                // contents
+                // of the table. To set up a Scanner, do like you did above making a Put
+                // and a Get, create a Scan. Adorn it with column names, etc.
+                Scan s = new Scan();
+                s.addColumn(Bytes.toBytes("myLittleFamily"), Bytes
+                                .toBytes("someQualifier"));
+                ResultScanner scanner = table.getScanner(s);
+                try {
+                        // Scanners return Result instances.
+                        // Now, for the actual iteration. One way is to use a while loop
+                        // like so:
+                        for (Result rr = scanner.next(); rr != null; rr = scanner.next()) {
+                                // print out the row we found and the columns we were looking
+                                // for
+                                System.out.println("Found row: " + rr);
+                        }
+                        // The other approach is to use a foreach loop. Scanners are
+                        // iterable!
+                        // for (Result rr : scanner) {
+                        // System.out.println("Found row: " + rr);
+                        // }
+                } finally {
+                        // Make sure you close your scanners when you are done!
+                        // Thats why we have it inside a try/finally clause
+                        scanner.close();
+                }
+        }
+}
+}}}