Context Navigation

source: sample/hadoop-0.16/WordCount2.java @ 24

Last change on this file since 24 was 24, checked in by waue, 16 years ago
upload test
File size: 4.4 KB

Rev	Line
[24]	1	import java.io.BufferedReader;
	2	import java.io.FileReader;
	3	import java.io.IOException;
	4	import java.util.HashSet;
	5	import java.util.Iterator;
	6	import java.util.Set;
	7	import java.util.StringTokenizer;
	8
	9	import org.apache.hadoop.conf.Configured;
	10	import org.apache.hadoop.filecache.DistributedCache;
	11	import org.apache.hadoop.fs.FileSystem;
	12	import org.apache.hadoop.fs.Path;
	13	import org.apache.hadoop.io.IntWritable;
	14	import org.apache.hadoop.io.LongWritable;
	15	import org.apache.hadoop.io.Text;
	16	import org.apache.hadoop.mapred.FileInputFormat;
	17	import org.apache.hadoop.mapred.FileOutputFormat;
	18	import org.apache.hadoop.mapred.JobClient;
	19	import org.apache.hadoop.mapred.JobConf;
	20	import org.apache.hadoop.mapred.MapReduceBase;
	21	import org.apache.hadoop.mapred.Mapper;
	22	import org.apache.hadoop.mapred.OutputCollector;
	23	import org.apache.hadoop.mapred.Reducer;
	24	import org.apache.hadoop.mapred.Reporter;
	25	import org.apache.hadoop.util.StringUtils;
	26
	27	public class WordCount2 extends Configured {
	28
	29	public static class Map extends MapReduceBase implements
	30	Mapper<LongWritable, Text, Text, IntWritable> {
	31
	32	static enum Counters {
	33	INPUT_WORDS
	34	}
	35
	36	private final static IntWritable one = new IntWritable(1);
	37
	38	private Text word = new Text();
	39
	40	private boolean caseSensitive = true;
	41
	42	private Set<String> patternsToSkip = new HashSet<String>();
	43
	44	private long numRecords = 0;
	45
	46	private String inputFile;
	47
	48	public void configure(JobConf job) {
	49	caseSensitive = job.getBoolean("wordcount.case.sensitive", true);
	50	inputFile = job.get("map.input.file");
	51
	52	if (job.getBoolean("wordcount.skip.patterns", false)) {
	53	Path[] patternsFiles = new Path[0];
	54	try {
	55	patternsFiles = DistributedCache.getLocalCacheFiles(job);
	56	} catch (IOException ioe) {
	57	System.err
	58	.println("Caught exception while getting cached files: "
	59	+ StringUtils.stringifyException(ioe));
	60	}
	61	for (Path patternsFile : patternsFiles) {
	62	parseSkipFile(patternsFile);
	63	}
	64	}
	65	}
	66
	67	private void parseSkipFile(Path patternsFile) {
	68	try {
	69	BufferedReader fis = new BufferedReader(new FileReader(
	70	patternsFile.toString()));
	71	String pattern = null;
	72	while ((pattern = fis.readLine()) != null) {
	73	patternsToSkip.add(pattern);
	74	}
	75	} catch (IOException ioe) {
	76	System.err
	77	.println("Caught exception while parsing the cached file '"
	78	+ patternsFile
	79	+ "' : "
	80	+ StringUtils.stringifyException(ioe));
	81	}
	82	}
	83
	84	public void map(LongWritable key, Text value,
	85	OutputCollector<Text, IntWritable> output, Reporter reporter)
	86	throws IOException {
	87	String line = (caseSensitive) ? value.toString() : value.toString()
	88	.toLowerCase();
	89
	90	for (String pattern : patternsToSkip) {
	91	line = line.replaceAll(pattern, "");
	92	}
	93
	94	StringTokenizer tokenizer = new StringTokenizer(line);
	95	while (tokenizer.hasMoreTokens()) {
	96	word.set(tokenizer.nextToken());
	97	output.collect(word, one);
	98	reporter.incrCounter(Counters.INPUT_WORDS, 1);
	99	}
	100
	101	if ((++numRecords % 100) == 0) {
	102	reporter.setStatus("Finished processing " + numRecords
	103	+ " records " + "from the input file: " + inputFile);
	104	}
	105	}
	106	}
	107
	108	public static class Reduce extends MapReduceBase implements
	109	Reducer<Text, IntWritable, Text, IntWritable> {
	110	public void reduce(Text key, Iterator<IntWritable> values,
	111	OutputCollector<Text, IntWritable> output, Reporter reporter)
	112	throws IOException {
	113	int sum = 0;
	114	while (values.hasNext()) {
	115	sum += values.next().get();
	116	}
	117	output.collect(key, new IntWritable(sum));
	118	}
	119	}
	120
	121	public static void main(String[] args) throws IOException {
	122	String filename = "/user/waue/input/";
	123	String outputPath = "sample-counts";
	124	int mapTasks = 20;
	125	int reduceTasks = 1;
	126
	127	JobConf conf = new JobConf(WordCount2.class);
	128	conf.setJobName("wordcount");
	129
	130	conf.setNumMapTasks(mapTasks);
	131	conf.setNumReduceTasks(reduceTasks);
	132
	133	// conf.setInputPath(new Path(filename));
	134	FileInputFormat.setInputPaths(conf,new Path(filename));
	135	conf.setOutputKeyClass(Text.class);
	136	conf.setOutputValueClass(IntWritable.class);
	137
	138	// conf.setOutputPath(new Path(outputPath));
	139	FileOutputFormat.setOutputPath( conf, new Path(filename));
	140
	141
	142	conf.setMapperClass(Map.class);
	143	conf.setCombinerClass(Reduce.class);
	144	conf.setReducerClass(Reduce.class);
	145
	146	// Delete the output directory if it exists already
	147	Path outputDir = new Path(outputPath);
	148	FileSystem.get(conf).delete(outputDir,true);
	149	JobClient.runJob(conf);
	150	}
	151	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: