Context Navigation

WordCount2.java @ 109

Last change on this file since 109 was 25, checked in by waue, 16 years ago
downgrade from 0.17 to 0.16 test for work -> not yet
File size: 4.2 KB

Line
1	import java.io.BufferedReader;
2	import java.io.FileReader;
3	import java.io.IOException;
4	import java.util.HashSet;
5	import java.util.Iterator;
6	import java.util.Set;
7	import java.util.StringTokenizer;
8
9	import org.apache.hadoop.conf.Configured;
10	import org.apache.hadoop.filecache.DistributedCache;
11	import org.apache.hadoop.fs.FileSystem;
12	import org.apache.hadoop.fs.Path;
13	import org.apache.hadoop.io.IntWritable;
14	import org.apache.hadoop.io.LongWritable;
15	import org.apache.hadoop.io.Text;
16	import org.apache.hadoop.mapred.JobClient;
17	import org.apache.hadoop.mapred.JobConf;
18	import org.apache.hadoop.mapred.MapReduceBase;
19	import org.apache.hadoop.mapred.Mapper;
20	import org.apache.hadoop.mapred.OutputCollector;
21	import org.apache.hadoop.mapred.Reducer;
22	import org.apache.hadoop.mapred.Reporter;
23	import org.apache.hadoop.util.StringUtils;
24
25	public class WordCount2 extends Configured {
26
27	public static class Map extends MapReduceBase implements
28	Mapper<LongWritable, Text, Text, IntWritable> {
29
30	static enum Counters {
31	INPUT_WORDS
32	}
33
34	private final static IntWritable one = new IntWritable(1);
35
36	private Text word = new Text();
37
38	private boolean caseSensitive = true;
39
40	private Set<String> patternsToSkip = new HashSet<String>();
41
42	private long numRecords = 0;
43
44	private String inputFile;
45
46	public void configure(JobConf job) {
47	caseSensitive = job.getBoolean("wordcount.case.sensitive", true);
48	inputFile = job.get("map.input.file");
49
50	if (job.getBoolean("wordcount.skip.patterns", false)) {
51	Path[] patternsFiles = new Path[0];
52	try {
53	patternsFiles = DistributedCache.getLocalCacheFiles(job);
54	} catch (IOException ioe) {
55	System.err
56	.println("Caught exception while getting cached files: "
57	+ StringUtils.stringifyException(ioe));
58	}
59	for (Path patternsFile : patternsFiles) {
60	parseSkipFile(patternsFile);
61	}
62	}
63	}
64
65	private void parseSkipFile(Path patternsFile) {
66	try {
67	BufferedReader fis = new BufferedReader(new FileReader(
68	patternsFile.toString()));
69	String pattern = null;
70	while ((pattern = fis.readLine()) != null) {
71	patternsToSkip.add(pattern);
72	}
73	} catch (IOException ioe) {
74	System.err
75	.println("Caught exception while parsing the cached file '"
76	+ patternsFile
77	+ "' : "
78	+ StringUtils.stringifyException(ioe));
79	}
80	}
81
82	public void map(LongWritable key, Text value,
83	OutputCollector<Text, IntWritable> output, Reporter reporter)
84	throws IOException {
85	String line = (caseSensitive) ? value.toString() : value.toString()
86	.toLowerCase();
87
88	for (String pattern : patternsToSkip) {
89	line = line.replaceAll(pattern, "");
90	}
91
92	StringTokenizer tokenizer = new StringTokenizer(line);
93	while (tokenizer.hasMoreTokens()) {
94	word.set(tokenizer.nextToken());
95	output.collect(word, one);
96	reporter.incrCounter(Counters.INPUT_WORDS, 1);
97	}
98
99	if ((++numRecords % 100) == 0) {
100	reporter.setStatus("Finished processing " + numRecords
101	+ " records " + "from the input file: " + inputFile);
102	}
103	}
104	}
105
106	public static class Reduce extends MapReduceBase implements
107	Reducer<Text, IntWritable, Text, IntWritable> {
108	public void reduce(Text key, Iterator<IntWritable> values,
109	OutputCollector<Text, IntWritable> output, Reporter reporter)
110	throws IOException {
111	int sum = 0;
112	while (values.hasNext()) {
113	sum += values.next().get();
114	}
115	output.collect(key, new IntWritable(sum));
116	}
117	}
118
119	public static void main(String[] args) throws IOException {
120	String filename = "/user/waue/input/";
121	String outputPath = "sample-counts";
122	int mapTasks = 20;
123	int reduceTasks = 1;
124
125	JobConf conf = new JobConf(WordCount2.class);
126	conf.setJobName("wordcount");
127
128	conf.setNumMapTasks(mapTasks);
129	conf.setNumReduceTasks(reduceTasks);
130
131	conf.setInputPath(new Path(filename));
132
133	conf.setOutputKeyClass(Text.class);
134	conf.setOutputValueClass(IntWritable.class);
135
136	conf.setOutputPath(new Path(outputPath));
137
138
139
140	conf.setMapperClass(Map.class);
141	conf.setCombinerClass(Reduce.class);
142	conf.setReducerClass(Reduce.class);
143
144	// Delete the output directory if it exists already
145	Path outputDir = new Path(outputPath);
146	FileSystem.get(conf).delete(outputDir);
147	JobClient.runJob(conf);
148	}
149	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: sample/hadoop-0.16/WordCount2.java @ 109

Download in other formats: