import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.StringUtils;

public class WordCount2 extends Configured {

    public static class Map extends MapReduceBase implements
            Mapper<LongWritable, Text, Text, IntWritable> {

        static enum Counters {
            INPUT_WORDS
        }

        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        private boolean caseSensitive = true;
        private Set<String> patternsToSkip = new HashSet<String>();
        private long numRecords = 0;
        private String inputFile;

        public void configure(JobConf job) {
            caseSensitive = job.getBoolean("wordcount.case.sensitive", true);
            inputFile = job.get("map.input.file");

            // Load the skip patterns from the DistributedCache, if enabled.
            if (job.getBoolean("wordcount.skip.patterns", false)) {
                Path[] patternsFiles = new Path[0];
                try {
                    patternsFiles = DistributedCache.getLocalCacheFiles(job);
                } catch (IOException ioe) {
                    System.err.println("Caught exception while getting cached files: "
                            + StringUtils.stringifyException(ioe));
                }
                for (Path patternsFile : patternsFiles) {
                    parseSkipFile(patternsFile);
                }
            }
        }

        // Read one regular expression per line from the localized patterns file.
        private void parseSkipFile(Path patternsFile) {
            try {
                BufferedReader fis = new BufferedReader(new FileReader(
                        patternsFile.toString()));
                try {
                    String pattern = null;
                    while ((pattern = fis.readLine()) != null) {
                        patternsToSkip.add(pattern);
                    }
                } finally {
                    fis.close(); // avoid leaking the file handle
                }
            } catch (IOException ioe) {
                System.err.println("Caught exception while parsing the cached file '"
                        + patternsFile + "' : "
                        + StringUtils.stringifyException(ioe));
            }
        }

        public void map(LongWritable key, Text value,
                OutputCollector<Text, IntWritable> output, Reporter reporter)
                throws IOException {
            String line = caseSensitive ? value.toString()
                    : value.toString().toLowerCase();

            // Remove every occurrence of each skip pattern before tokenizing.
            for (String pattern : patternsToSkip) {
                line = line.replaceAll(pattern, "");
            }

            StringTokenizer tokenizer = new StringTokenizer(line);
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                output.collect(word, one);
                reporter.incrCounter(Counters.INPUT_WORDS, 1);
            }

            // Report progress every 100 records.
            if ((++numRecords % 100) == 0) {
                reporter.setStatus("Finished processing " + numRecords
                        + " records from the input file: " + inputFile);
            }
        }
    }

    public static class Reduce extends MapReduceBase implements
            Reducer<Text, IntWritable, Text, IntWritable> {
        public void reduce(Text key, Iterator<IntWritable> values,
                OutputCollector<Text, IntWritable> output, Reporter reporter)
                throws IOException {
            int sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            output.collect(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws IOException {
        String filename = "/user/waue/input/";
        String outputPath = "sample-counts";
        int mapTasks = 20;
        int reduceTasks = 1;

        JobConf conf = new JobConf(WordCount2.class);
        conf.setJobName("wordcount");

        conf.setNumMapTasks(mapTasks);
        conf.setNumReduceTasks(reduceTasks);

        // JobConf.setInputPath/setOutputPath were removed from the mapred API;
        // use the FileInputFormat/FileOutputFormat helpers instead.
        FileInputFormat.setInputPaths(conf, new Path(filename));
        FileOutputFormat.setOutputPath(conf, new Path(outputPath));

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        conf.setMapperClass(Map.class);
        conf.setCombinerClass(Reduce.class);
        conf.setReducerClass(Reduce.class);

        // Delete the output directory if it exists already.
        Path outputDir = new Path(outputPath);
        FileSystem.get(conf).delete(outputDir, true);

        JobClient.runJob(conf);
    }
}
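As listed, main() never actually turns on the skip-patterns feature that the mapper supports: nothing sets wordcount.skip.patterns, and no file is ever placed in the DistributedCache. A minimal sketch of wiring it up, inserted before JobClient.runJob(conf); the HDFS path /user/waue/patterns.txt is hypothetical, and the file is assumed to contain one regular expression per line:

    conf.setBoolean("wordcount.skip.patterns", true);
    // Hypothetical path: the patterns file must already exist in HDFS,
    // one java.util.regex pattern per line (e.g. \. or \,).
    DistributedCache.addCacheFile(new Path("/user/waue/patterns.txt").toUri(), conf);

Each mapper then localizes the file in configure() and strips every match from each input line before tokenizing.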
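To run the job, assuming the class has been compiled against the Hadoop jars and packaged as wordcount2.jar (a hypothetical jar name):

    hadoop jar wordcount2.jar WordCount2

Note that the input path (/user/waue/input/) and output directory (sample-counts) are hard-coded in main(), so pointing the job at other data means editing and rebuilding; and since the class extends Configured but does not implement Tool, generic -D options such as -Dwordcount.case.sensitive=false are not parsed from the command line.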