/**
 * Program: DemoWordCountTuple.java
 * Editor: Waue Chen
 * From: NCHC, Taiwan
 * Last Update Date: 07/02/2008
 * Upgrade to 0.17
 */
/*
 * Cloud9: A MapReduce Library for Hadoop
 */

package tw.org.nchc.demo;

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;

import tw.org.nchc.code.Convert;
import tw.org.nchc.tuple.Schema;
import tw.org.nchc.tuple.Tuple;

/**
 * <p>
 * Demo that illustrates the use of the tuple library ({@link Tuple} class).
 * Input comes from the Bible+Shakespeare sample collection, encoded as
 * single-field tuples; see {@link DemoPackRecords}. Sample of the final output:
 * </p>
 *
 * <pre>
 * ...
 * (admirable, 0) 9
 * (admirable, 1) 6
 * (admiral, 0) 2
 * (admiral, 1) 4
 * (admiration, 0) 10
 * (admiration, 1) 6
 * (admire, 0) 5
 * (admire, 1) 3
 * (admired, 0) 12
 * (admired, 1) 7
 * ...
 * </pre>
 *
 * <p>
 * Output keys are tuples of the form (Token, EvenOrOdd): the first field holds
 * a token, and the second field indicates whether the token was found on a line
 * with an even (0) or an odd (1) number of characters. Values are the counts of
 * tuple occurrences in the collection.
 * </p>
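 *
 * <p>
 * For reference, each key tuple is built from the {@link Schema} defined in
 * this class, using the same calls the mapper below makes. A minimal sketch
 * (the sample line and token are illustrative only, not extra functionality
 * of the demo):
 * </p>
 *
 * <pre>
 * String line = "admire the admirable admiral";   // an illustrative input line
 * Tuple key = KEY_SCHEMA.instantiate();
 * key.set("Token", "admire");                     // first token on the line
 * key.set("EvenOrOdd", line.length() % 2);        // 0 = even length, 1 = odd
 * </pre>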
 *
 * <p>
 * Expected trace of the demo:
 * </p>
 *
 * <pre>
 * Map input records=156215
 * Map output records=1734298
 * Map input bytes=13118917
 * Map output bytes=66214039
 * Combine input records=1734298
 * Combine output records=192045
 * Reduce input groups=59225
 * Reduce input records=192045
 * Reduce output records=59225
 * </pre>
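 *
 * <p>
 * The job writes its results as a SequenceFile of (Tuple, IntWritable) pairs.
 * A minimal sketch for reading one output part file back (the part-file name
 * follows the standard Hadoop naming convention; this reader is illustrative
 * and not part of the demo itself):
 * </p>
 *
 * <pre>
 * JobConf conf = new JobConf(DemoWordCountTuple.class);
 * Path part = new Path("word-counts-tuple/part-00000");
 * SequenceFile.Reader reader =
 *     new SequenceFile.Reader(FileSystem.get(conf), part, conf);
 * Tuple key = KEY_SCHEMA.instantiate();
 * IntWritable value = new IntWritable();
 * while (reader.next(key, value)) {
 *   System.out.println(key + "\t" + value);
 * }
 * reader.close();
 * </pre>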
 *
 * <p>
 * Obviously, this isn't a particularly meaningful program, but it does
 * illustrate the use of the {@link Tuple} class.
 * </p>
 */
public class DemoWordCountTuple {

	// create the schema for the tuple that will serve as the key
	private static final Schema KEY_SCHEMA = new Schema();

	// define the schema statically
	static {
		KEY_SCHEMA.addField("Token", String.class, "");
		KEY_SCHEMA.addField("EvenOrOdd", Integer.class, new Integer(1));
	}

	// mapper that emits the tuple as the key and the value '1' for each occurrence
	private static class MapClass extends MapReduceBase implements
			Mapper<LongWritable, Tuple, Tuple, IntWritable> {

		// define the value '1' statically so we can reuse the object, i.e.,
		// avoid unnecessary object creation
		private final static IntWritable one = new IntWritable(1);

		// once again, reuse tuples if possible
		private Tuple tupleOut = KEY_SCHEMA.instantiate();

		public void map(LongWritable key, Tuple tupleIn,
				OutputCollector<Tuple, IntWritable> output, Reporter reporter)
				throws IOException {

			// the input value is a tuple; get field 0
			// see DemoPackRecords for how the input SequenceFile is generated
			String line = (String) tupleIn.get(0);
			StringTokenizer itr = new StringTokenizer(line);
			while (itr.hasMoreTokens()) {
				String token = itr.nextToken();

				// put new values into the tuple
				tupleOut.set("Token", token);
				tupleOut.set("EvenOrOdd", line.length() % 2);

				// emit key-value pair
				output.collect(tupleOut, one);
			}
		}
	}

	// reducer that counts up tuple occurrences
	private static class ReduceClass extends MapReduceBase implements
			Reducer<Tuple, IntWritable, Tuple, IntWritable> {

		// reuse the output value object across calls
		private final static IntWritable SumValue = new IntWritable();

		public synchronized void reduce(Tuple tupleKey,
				Iterator<IntWritable> values,
				OutputCollector<Tuple, IntWritable> output, Reporter reporter)
				throws IOException {
			// sum the values
			int sum = 0;
			while (values.hasNext()) {
				sum += values.next().get();
			}

			// keep the original tuple key, emit the sum of counts as the value
			SumValue.set(sum);
			output.collect(tupleKey, SumValue);
		}
	}

	// private constructor to prevent instantiation
	private DemoWordCountTuple() {
	}

	/**
	 * Runs the demo.
	 */
	public static void main(String[] args) throws IOException {
		String inPath = "/shared/sample-input/bible+shakes.nopunc.packed";
		String outputPath = "word-counts-tuple";
		int numMapTasks = 20;
		int numReduceTasks = 20;

		JobConf conf = new JobConf(DemoWordCountTuple.class);
		conf.setJobName("wordcount");

		conf.setNumMapTasks(numMapTasks);
		conf.setNumReduceTasks(numReduceTasks);

		// 0.16 API: conf.setInputPath(new Path(inPath));
		Convert.setInputPath(conf, new Path(inPath));
		conf.setInputFormat(SequenceFileInputFormat.class);

		// 0.16 API: conf.setOutputPath(new Path(outputPath));
		Convert.setOutputPath(conf, new Path(outputPath));
		conf.setOutputKeyClass(Tuple.class);
		conf.setOutputValueClass(IntWritable.class);
		conf.setOutputFormat(SequenceFileOutputFormat.class);

		conf.setMapperClass(MapClass.class);
		// the reducer doubles as a combiner, since summing counts is
		// associative and commutative
		conf.setCombinerClass(ReduceClass.class);
		conf.setReducerClass(ReduceClass.class);

		JobClient.runJob(conf);
	}
}