Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

WordCountFromHBase.java @ 20

Last change on this file since 20 was 20, checked in by waue, 16 years ago
將改完的 hadoop 0.17版package 放來備份目前繼續開發 hadoop 0.16 + hbase 1.3
File size: 6.3 KB

Line
1	/**
2	* Program: WordCountFromHBase.java
3	* Editor: Waue Chen
4	* From : NCHC. Taiwn
5	* Last Update Date: 07/02/2008
6	* Upgrade to 0.17
7	*/
8
9	/**
10	* Purpose :
11	* Word counting from Hbase then store result in Hadoop file system
12	*
13	* HowToUse :
14	* Make sure Hadoop file system are running and HBase has correct data.
15	* Suggest to run WordCountIntoHBase first.
16	* finally, modify these setup parameters and run.
17	*
18	* Check Result:
19	*
20	* inspect http://localhost:50070 by web explorer
21	*/
22
23	package tw.org.nchc.code;
24
25	import java.io.IOException;
26	import java.util.Iterator;
27	import java.util.StringTokenizer;
28
29	import org.apache.hadoop.fs.FileSystem;
30	import org.apache.hadoop.fs.Path;
31	import org.apache.hadoop.hbase.HStoreKey;
32	import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
33	import org.apache.hadoop.hbase.mapred.TableInputFormat;
34	import org.apache.hadoop.hbase.mapred.TableMap;
35	import org.apache.hadoop.io.IntWritable;
36	import org.apache.hadoop.io.MapWritable;
37	import org.apache.hadoop.io.Text;
38	import org.apache.hadoop.mapred.JobClient;
39	import org.apache.hadoop.mapred.JobConf;
40	import org.apache.hadoop.mapred.MapReduceBase;
41	import org.apache.hadoop.mapred.OutputCollector;
42	import org.apache.hadoop.mapred.Reducer;
43	import org.apache.hadoop.mapred.Reporter;
44	@SuppressWarnings("unused")
45
46	public class WordCountFromHBase {
47	/* setup parameters */
48	// set the output path
49	static String outputPath = "counts2";
50
51	// org.apache.hadoop.hbase.mapred.TableMap<K,V> \
52	// TableMap<K extends org.apache.hadoop.io.WritableComparable, \
53	// V extends org.apache.hadoop.io.Writable> \
54	// Scan an HBase table to sort by a specified sort column. \
55	// If the column does not exist, the record is not passed to Reduce.;
56	private static class MapClass extends TableMap<Text, IntWritable> {
57
58	// set one as (IntWritable)1
59	private final static IntWritable one = new IntWritable(1);
60	// set column
61	private final static Text textcol = new Text(WordCountIntoHBase.colstr);
62	private Text word = new Text();
63	// TableMap is a interface, map is a abstract method. now, we should \
64	// inprement map() at here, format is : \
65	// map(HStoreKey key, MapWritable value, \
66	// OutputCollector<K,V> output, Reporter reporter) ;
67	// Call a user defined function on a single HBase record, \
68	// represented by a key and its associated record value. ;
69	public void map(HStoreKey key, MapWritable cols,
70	OutputCollector<Text, IntWritable> output, Reporter reporter)
71	throws IOException {
72	//
73	// The first get() is : Writable <- get(Object key) \
74	// get in interface Map<Writable,Writable> ;
75	// Use ImmutableBytesWritable to downcast Writable \
76	// The second get() is : byte[] <- get() \
77	// Get the data from the BytesWritable. ;
78	// Text.decode is parse UTF-8 code to a String ;
79	// per "line" is per row data in HTable
80	String line = Text.decode( ((ImmutableBytesWritable) cols.get(textcol) )
81	.get() );
82
83	//let us know what is "line"
84	/*
85	RandomAccessFile raf =
86	new RandomAccessFile("/home/waue/mr-result.txt","rw");
87	raf.seek(raf.length()); // move pointer to end
88	raf.write(("\n"+line).getBytes());
89	raf.close();
90	*///end
91	// the result is the contents of merged files "
92
93	//StringTokenizer will divide a line into a word
94	StringTokenizer itr = new StringTokenizer(line);
95	// set every word as one
96	while (itr.hasMoreTokens()) {
97	// nextToken will return this value in String and point to next \
98	// Text.set() = Set to contain the contents of a string.
99	word.set(itr.nextToken());
100	// OutputCollector.collect = collect(K key, V value) \
101	// Adds a key/value pair to the output.
102	output.collect(word, one);
103	}
104	}
105	}
106
107	// reducer: sums up all the counts
108	private static class ReduceClass extends MapReduceBase implements
109	Reducer<Text, IntWritable, Text, IntWritable> {
110
111	// reuse objects
112	private final static IntWritable SumValue = new IntWritable();
113
114	// this sample's reduce() format is the same as map() \
115	// reduce is a method waiting for implement \
116	// four type in this sample is (Text , Iterator<IntWritable>, \
117	// OutputCollector<Text, IntWritable> , Reporter ) ;
118	public void reduce(Text key, Iterator<IntWritable> values,
119	OutputCollector<Text, IntWritable> output, Reporter reporter)
120	throws IOException {
121	// sum up value
122	int sum = 0;
123	// "key" is word , "value" is sum
124	// why values.hasNext(), not key.hasNext()
125	while (values.hasNext()) {
126	// next() will return this value and pointer to next event \
127	// IntWritable.get() will transfer IntWritable to Int
128	sum += values.next().get();
129	}
130	// IntWritable.set(int) will transfer Int to IntWritable
131	SumValue.set(sum);
132	// hense we set outputPath in main, the output.collect will put
133	// data in Hadoop
134	output.collect(key, SumValue);
135	}
136	}
137
138	private WordCountFromHBase() {
139	}
140
141	/**
142	* Runs the demo.
143	*/
144	public static void main(String[] args) throws IOException {
145
146
147	int mapTasks = 1;
148	int reduceTasks = 1;
149	// initialize job;
150	JobConf conf = new JobConf(WordCountFromHBase.class);
151	// TableMap.initJob will build HBase code \
152	// "org.apache.hadoop.hbase.mapred.TableMap".initJob \
153	// (Table_name,column_string,Which_class_will_use,job_configure);
154	TableMap.initJob(WordCountIntoHBase.Table_Name,
155	WordCountIntoHBase.colstr, MapClass.class, conf);
156	conf.setJobName(WordCountIntoHBase.Table_Name + "store");
157	conf.setNumMapTasks(mapTasks);
158	conf.setNumReduceTasks(reduceTasks);
159
160	//Set the key class for the job output data.
161	conf.setOutputKeyClass(Text.class);
162	//Set the value class for job outputs.
163	conf.setOutputValueClass(IntWritable.class);
164	// MapperClass,CombinerClass,ReducerClass are essential
165	conf.setMapperClass(MapClass.class);
166	conf.setCombinerClass(ReduceClass.class);
167	conf.setReducerClass(ReduceClass.class);
168	// input is Hbase format => TableInputFormat
169	conf.setInputFormat(TableInputFormat.class);
170	// 0.16
171	// conf.setOutputPath(new Path(outputPath));
172	Convert.setOutputPath(conf, new Path(outputPath));
173	// delete the old path with the same name
174	FileSystem.get(conf).delete(new Path(outputPath),true);
175	JobClient.runJob(conf);
176	}
177	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: sample/hadoop-0.17/tw/org/nchc/code/WordCountFromHBase.java @ 20

Download in other formats: