source: sample/hadoop-0.16/tw/org/nchc/demo/DemoWordCount.java @ 72

Last change on this file since 72 was 27, checked in by waue, 16 years ago

test!

File size: 3.4 KB
Line 
1/**
2 * Program: DemoWordCount.java
3 * Editor: Waue Chen
4 * From :  NCHC. Taiwan
5 * Last Update Date: 07/02/2008
6 */
7/*
8 * Cloud9: A MapReduce Library for Hadoop
9 */
10
11package tw.org.nchc.demo;
12
13import java.io.IOException;
14import java.util.Iterator;
15import java.util.StringTokenizer;
16
17import org.apache.hadoop.fs.FileSystem;
18import org.apache.hadoop.fs.Path;
19import org.apache.hadoop.io.IntWritable;
20import org.apache.hadoop.io.LongWritable;
21import org.apache.hadoop.io.Text;
22import org.apache.hadoop.mapred.JobClient;
23import org.apache.hadoop.mapred.JobConf;
24import org.apache.hadoop.mapred.MapReduceBase;
25import org.apache.hadoop.mapred.Mapper;
26import org.apache.hadoop.mapred.OutputCollector;
27import org.apache.hadoop.mapred.Reducer;
28import org.apache.hadoop.mapred.Reporter;
29
30/**
31 * <p>Simple word count demo. Counts words in the Bible+Shakespeare sample
32 * collection. Expected trace of MapReduce operation:</p>
33 *
34 * <pre>
35 * Map input records=156215
36 * Map output records=1734298
37 * Map input bytes=9068074
38 * Map output bytes=15919397
39 * Combine input records=1734298
40 * Combine output records=135372
41 * Reduce input groups=41788
42 * Reduce input records=135372
43 * Reduce output records=41788
44 * </pre>
45 *
46 */
47public class DemoWordCount {
48
49  // mapper: emits (token, 1) for every word occurrence
50  private static class MapClass extends MapReduceBase implements
51      Mapper<LongWritable, Text, Text, IntWritable> {
52
53    // reuse objects to save overhead of object creation
54    private final static IntWritable one = new IntWritable(1);
55    private Text word = new Text();
56
57    public void map(LongWritable key, Text value,
58        OutputCollector<Text, IntWritable> output, Reporter reporter)
59        throws IOException {
60      String line = ((Text) value).toString();
61      StringTokenizer itr = new StringTokenizer(line);
62      while (itr.hasMoreTokens()) {
63        word.set(itr.nextToken());
64        output.collect(word, one);
65      }
66    }
67  }
68
69  // reducer: sums up all the counts
70  private static class ReduceClass extends MapReduceBase implements
71      Reducer<Text, IntWritable, Text, IntWritable> {
72
73    // reuse objects
74    private final static IntWritable SumValue = new IntWritable();
75
76    public void reduce(Text key, Iterator<IntWritable> values,
77        OutputCollector<Text, IntWritable> output, Reporter reporter)
78        throws IOException {
79      // sum up values
80      int sum = 0;
81      while (values.hasNext()) {
82        sum += values.next().get();
83      }
84      SumValue.set(sum);
85      output.collect(key, SumValue);
86    }
87  }
88
89  private DemoWordCount() {
90  }
91 
92  /**
93   * Runs the demo.
94   */
95  public static void main(String[] args) throws IOException {
96    String filename = "/user/waue/test/132.txt";
97    String outputPath = "sample-counts";
98    int mapTasks = 20;
99    int reduceTasks = 1;
100
101    JobConf conf = new JobConf(DemoWordCount.class);
102    conf.setJobName("wordcount");
103
104    conf.setNumMapTasks(mapTasks);
105    conf.setNumReduceTasks(reduceTasks);
106
107    conf.setInputPath(new Path(filename));
108
109    conf.setOutputKeyClass(Text.class);
110    conf.setOutputValueClass(IntWritable.class);
111
112    conf.setOutputPath(new Path(outputPath));
113    conf.setMapperClass(MapClass.class);
114    conf.setCombinerClass(ReduceClass.class);
115    conf.setReducerClass(ReduceClass.class);
116   
117    // Delete the output directory if it exists already
118    Path outputDir = new Path(outputPath);
119    FileSystem.get(conf).delete(outputDir);
120    JobClient.runJob(conf);
121  }
122}
Note: See TracBrowser for help on using the repository browser.