
HBase Programming Examples

Example 1: WordCountHBase

package hbase020;
// WordCountHBase
// Description:
//   Reads the files under the input path, counts the words in them,
//   and writes the results back into an HTable.
//
// How to run:
//   Build against Hadoop 0.20, add the HBase settings to the job
//   classpath as described in reference 2, package the code as XX.jar,
//   then run:
//   ---------------------------
//   hadoop jar XX.jar WordCountHBase <hdfs_input>
//   ---------------------------
//
// Result:
// ---------------------------
//   $ hbase shell
//   > scan 'wordcount'
//   ROW   COLUMN+CELL
//   am    column=content:count, timestamp=1264406245488, value=1
//   chen  column=content:count, timestamp=1264406245488, value=1
//   hi,   column=content:count, timestamp=1264406245488, value=2
//   ...(truncated)
// ---------------------------
// Notes:
// 1. The source files live on HDFS under "/user/$YOUR_NAME/input".
//    Upload your data to that directory first; it may contain only
//    files, not subdirectories.
// 2. When the job finishes, the results are stored in the HBase table
//    named 'wordcount'.
//
// References:
//   1. Code adapted from: http://blog.ring.idv.tw/comment.ser?i=337
//   2. Running MapReduce jobs against HBase: http://wiki.apache.org/hadoop/Hbase/MapReduce

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class WordCountHBase {
  public static class HtMap extends
      Mapper<LongWritable, Text, Text, IntWritable> {
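    // reuse a single IntWritable(1) instance for every word we emit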
    private IntWritable one = new IntWritable(1);

    public void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      // lower-case the input line, then split it on spaces
      String s[] = value.toString().toLowerCase().trim().split(" ");
      for (String m : s) {
        // emit <word, 1> to the output stream
        context.write(new Text(m), one);
      }
    }
  }
  // TableReducer<KEYIN, VALUEIN, KEYOUT>
  // This was originally TableReducer<Text, IntWritable, NullWritable>,
  // but LongWritable works here as well: TableOutputFormat ignores the
  // output key, so any Writable from org.apache.hadoop.io.* that
  // implements write() should do.
  public static class HtReduce extends
      TableReducer<Text, IntWritable, LongWritable> {
    
    public void reduce(Text key, Iterable<IntWritable> values,
        Context context) throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable i : values) {
        sum += i.get();
      }
      
      // org.apache.hadoop.hbase.client.Put performs Put operations for a
      // single row: new Put(byte[] row)
      Put put = new Put(Bytes.toBytes(key.toString()));

      // add(byte[] family, byte[] qualifier, byte[] value)
      // main() sets the output format class to TableOutputFormat, and
      // TableReducer requires the output value class to be Put or Delete
      put.add(Bytes.toBytes("content"), Bytes.toBytes("count"), Bytes
          .toBytes(String.valueOf(sum)));
            
      // TableOutputFormat ignores the output key, so any Writable works;
      // NullWritable.get() (the singleton instance) would do just as well:
      // context.write(NullWritable.get(), put);
      context.write(new LongWritable(), put);
    }
  }

  public static void createHBaseTable(String tablename) throws IOException {
    // HTableDescriptor holds the name of an HTable and its column
    // families; it describes the table's attributes
    HTableDescriptor htd = new HTableDescriptor(tablename);
    // HColumnDescriptor holds information about a column family, such as
    // the number of versions, compression settings, etc.
    // Column families are attached with HTableDescriptor#addFamily()
    htd.addFamily(new HColumnDescriptor("content:"));
    // HBaseConfiguration picks up the settings in hbase-site.xml
    HBaseConfiguration config = new HBaseConfiguration();
    // administrative operations go through HBaseAdmin
    HBaseAdmin admin = new HBaseAdmin(config);
    // if the table already exists, drop it first
    if (admin.tableExists(tablename)) {
      // disable
      admin.disableTable(tablename);
      // delete
      admin.deleteTable(tablename);
    }
    System.out.println("create new table: " + tablename);
    // create
    admin.createTable(htd);
  }

  public static void main(String args[]) throws Exception {
    // take the input path from the command line; fall back to a fixed
    // path when no argument is given (debug aid)
    String input = (args.length > 0) ? args[0] : "/user/waue/input";

    String tablename = "wordcount";

    Configuration conf = new Configuration();
    // TableOutputFormat.OUTPUT_TABLE = "hbase.mapred.outputtable"
    // conf.set() assigns a name/value pair, just as core-site.xml does;
    // here it tells the job: hbase.mapred.outputtable --> wordcount
    conf.set(TableOutputFormat.OUTPUT_TABLE, tablename);
    // create the HBase table first; otherwise the job fails
    createHBaseTable(tablename);

    Job job = new Job(conf, "WordCount table with " + input);

    job.setJarByClass(WordCountHBase.class);
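    // one reduce task keeps the example simple; more would work too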
    job.setNumReduceTasks(1);
    job.setMapperClass(HtMap.class);
    job.setReducerClass(HtReduce.class);
    // the map output is <Text, IntWritable>, which differs from the job
    // defaults, so declare it with setMapOutput{Key|Value}Class()
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    // FileInputFormat is the common base class for file-based input
    // formats (SequenceFileInputFormat, TextInputFormat, ...);
    // TextInputFormat is the most common and yields <LongWritable, Text>
    // records. HBase adds its own subclass, TableInputFormat.
    job.setInputFormatClass(TextInputFormat.class);
    // TableOutputFormat
    // this line sends the reduce output to an HBase table
    job.setOutputFormatClass(TableOutputFormat.class);

    // the old API set the input path with conf.setInputPath(Path); the
    // new API moved this to FileInputFormat.addInputPath(Job, Path),
    // presumably because some file operations need no MapReduce job
    FileInputFormat.addInputPath(job, new Path(input));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}
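
The result table can also be read back through the client API instead of the HBase shell. Below is a minimal sketch against the same 0.20-era API, assuming the 'wordcount' table and content:count column created by the example above; DumpWordCount is a hypothetical helper class, not part of the original program.

package hbase020;

import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.util.Bytes;

// Dumps every <row, content:count> pair that WordCountHBase wrote.
public class DumpWordCount {
  public static void main(String[] args) throws Exception {
    HBaseConfiguration config = new HBaseConfiguration();
    HTable table = new HTable(config, "wordcount");
    // restrict the scan to the single column the job populated
    Scan scan = new Scan();
    scan.addColumn(Bytes.toBytes("content"), Bytes.toBytes("count"));
    ResultScanner scanner = table.getScanner(scan);
    try {
      for (Result r : scanner) {
        String word = Bytes.toString(r.getRow());
        String count = Bytes.toString(
            r.getValue(Bytes.toBytes("content"), Bytes.toBytes("count")));
        System.out.println(word + " = " + count);
      }
    } finally {
      // always release the scanner
      scanner.close();
    }
  }
}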

Example 2: MyLittleHBaseClient

package hbase020;

// Description:
//   This program first creates a table, puts one row of data into it,
//   reads the row back with get, and finally scans the whole table
//   with scan.
//
// Reference: code adapted from
//   http://hadoop.apache.org/hbase/docs/current/api/org/apache/hadoop/hbase/client/package-summary.html#package_description

import java.io.IOException;

import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.util.Bytes;

// Class that has nothing but a main.
// Does a Put, a Get, and a Scan against an HBase table.
public class MyLittleHBaseClient {
  
  private static String table_name = "mytable";
  
  private static String the_row = "myLittleRow";
  
  private static String column_family = "family";
  
  private static String column_qualifier = "someQualifier";
  
  private static String cell_value = "someValue";

  public static void createHBaseTable(String tablename) throws IOException {
    // HTableDescriptor holds the name of an HTable and its column
    // families; it describes the table's attributes
    HTableDescriptor htd = new HTableDescriptor(tablename);
    // HColumnDescriptor holds information about a column family, such as
    // the number of versions, compression settings, etc.
    // Column families are attached with HTableDescriptor#addFamily()
    htd.addFamily(new HColumnDescriptor(column_family + ":"));
    // HBaseConfiguration picks up the settings in hbase-site.xml
    HBaseConfiguration config = new HBaseConfiguration();
    // administrative operations go through HBaseAdmin
    HBaseAdmin admin = new HBaseAdmin(config);
    // if the table already exists, drop it first
    if (admin.tableExists(tablename)) {
      // disable
      admin.disableTable(tablename);
      // delete
      admin.deleteTable(tablename);
    }
    System.out.println("create new table: " + tablename);
    // create
    admin.createTable(htd);
  }

  public static void main(String[] args) throws IOException {
    // You need a configuration object to tell the client where to connect.
    // When you create an HBaseConfiguration, it reads in whatever you've
    // set in your hbase-site.xml and hbase-default.xml, as long as these
    // can be found on the CLASSPATH.
    
    HBaseConfiguration config = new HBaseConfiguration();

    // create table automatically
    createHBaseTable(table_name);
    
    // This instantiates an HTable object that connects you to the
    // table named by table_name ("mytable").
    HTable table = new HTable(config, table_name);


    // To add to a row, use Put. A Put constructor takes the name of the
    // row you want to insert into as a byte array. In HBase, the Bytes
    // class has utilities for converting all kinds of Java types to byte
    // arrays. Below, we convert the row-key String into a byte array to
    // use as the row key for our update. Once you have a Put instance,
    // you can adorn it by setting the names of the columns you want to
    // update on the row, the timestamp to use in your update, etc. If no
    // timestamp is given, the server applies the current time to the edits.
    Put p = new Put(Bytes.toBytes(the_row));

    // To set the value you'd like to update in the row, specify the
    // column family, column qualifier, and value of the table cell you'd
    // like to update. The column family must already exist in your table
    // schema; the qualifier can be anything. All must be specified as
    // byte arrays, as HBase is all about byte arrays. Here the table was
    // created with the family named by column_family.
    p.add(Bytes.toBytes(column_family), Bytes.toBytes(column_qualifier),
        Bytes.toBytes(cell_value));

    // Once you've adorned your Put instance with all the updates you
    // want to make, commit it as follows (HTable#put takes the Put
    // instance you've been building and pushes the changes into HBase)
    table.put(p);

    // Now, retrieve the data we just wrote. The values that come back
    // are Result instances. Generally, a Result packages up the HBase
    // return in the form you find most palatable.
    Get g = new Get(Bytes.toBytes(the_row));
    Result r = table.get(g);
    byte[] value = r.getValue(Bytes.toBytes(column_family), Bytes
        .toBytes(column_qualifier));
    // If we convert the value bytes, we should get back the value we
    // inserted at this location ("someValue").
    String valueStr = Bytes.toString(value);
    System.out.println("GET: " + valueStr);

    // Sometimes you won't know the row you're looking for. In this case
    // you use a Scanner, which gives you a cursor-like interface to the
    // contents of the table. To set one up, do as you did above for the
    // Put and the Get: create a Scan and adorn it with column names, etc.
    Scan s = new Scan();
    s.addColumn(Bytes.toBytes(column_family), Bytes
        .toBytes(column_qualifier));
    ResultScanner scanner = table.getScanner(s);
    try {
      // Scanners return Result instances.
      // Scanners are iterable, so a foreach loop works:
      for (Result rr : scanner) {
        // print out the row we found and the columns we were looking for
        System.out.println("Found row: " + rr);
      }
      // Alternatively, call scanner.next() in a while loop until it
      // returns null.
    } finally {
      // Make sure you close your scanners when you are done!
      // That's why we have it inside a try/finally clause.
      scanner.close();
    }
  }
}
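
Put, Get, and Scan cover everything the example needs; the one basic client operation not shown is Delete. As a minimal sketch under the same assumptions (the 'mytable' table and row key used above), removing the row could look like this; DeleteLittleRow is a hypothetical companion class, not part of the original program.

package hbase020;

import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.util.Bytes;

// Deletes the row that MyLittleHBaseClient wrote.
public class DeleteLittleRow {
  public static void main(String[] args) throws Exception {
    HBaseConfiguration config = new HBaseConfiguration();
    HTable table = new HTable(config, "mytable");
    // with no columns added, Delete removes the entire row
    Delete d = new Delete(Bytes.toBytes("myLittleRow"));
    table.delete(d);
    System.out.println("DELETE: myLittleRow");
  }
}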
