{{{
#!html
<div style="text-align: center; color:#151B8D"><big style="font-weight: bold;"><big><big>
Hadoop 進階課程 
</big></big></big></div> <div style="text-align: center; color:#7E2217"><big style="font-weight: bold;"><big>
範例練習
</big></big></div>
}}}

[wiki:NCHCCloudCourse100928_4_EXM4 上一關 < ] 第五關 [wiki:NCHCCloudCourse100928_4_EXM6 > 下一關]

 = 說明 =
{{{
#!text
WordCountV20
說明： 
	用於字數統計，並且增加略過大小寫辨識、符號篩除等功能 [已全改為 hadoop 0.20 API ]

測試方法：
	將此程式運作在hadoop 0.20 平台上，執行：
	---------------------------
	hadoop jar WordCountV2.jar "/home/nchc/input" "/home/nchc/output-wc2" "-c"  "-skip" "/home/nchc/patterns/patterns.txt"
	---------------------------

注意：
1. 已在程式內設定 <input> <output> 路徑為 local 的 "/home/nchc/input" "/home/nchc/output-wc2"
2. 若要測試 skip功能，請建立一個"/home/nchc/patterns/patterns.txt" 檔，內容如下（一行一個，前置提示符號\）
        \.
        \,
        \!
3. 若要測試過濾大小寫功能，請加入 -c 參數（有-c 代表 "不考慮大小寫" ）
4. 注意 DistributedCache , setup() , conf 參數傳遞於 main, mapper, setup 中
}}}

 = WordCountV20.java =

{{{
#!java

package org.nchc.hadoop;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.HashSet;
import java.util.Set;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.StringUtils;

public class WordCountV20 {

	public static class Map extends
			Mapper<LongWritable, Text, Text, IntWritable> {
		static enum Counters {
			INPUT_WORDS
		}

		private final static IntWritable one = new IntWritable(1);
		private Text word = new Text();

		private boolean caseSensitive = true;
		private Set<String> patternsToSkip = new HashSet<String>();

		private void parseSkipFile(Path patternsFile) {
			try {
				BufferedReader fis = new BufferedReader(new FileReader(
						patternsFile.toString()));
				String pattern = null;
				while ((pattern = fis.readLine()) != null) {
					patternsToSkip.add(pattern);
				}
			} catch (IOException ioe) {
				System.err
						.println("Caught exception while parsing the cached file '"
								+ patternsFile
								+ "' : "
								+ StringUtils.stringifyException(ioe));
			}
		}

		@Override
		public void setup(Context context) {
			Configuration conf = context.getConfiguration();
			caseSensitive = conf.getBoolean("wordcount.case.sensitive", true);

			if (conf.getBoolean("wordcount.skip.patterns", false)) {
				Path[] patternsFiles = new Path[0];
				try {
					patternsFiles = DistributedCache.getLocalCacheFiles(conf);
				} catch (IOException ioe) {
					System.err
							.println("Caught exception while getting cached files: "
									+ StringUtils.stringifyException(ioe));
				}
				for (Path patternsFile : patternsFiles) {
					parseSkipFile(patternsFile);
				}
			}
		}

		@Override
		public void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {

			String line = (caseSensitive) ? value.toString() : value.toString()
					.toLowerCase();

			for (String pattern : patternsToSkip) {
				line = line.replaceAll(pattern, "");
			}

			StringTokenizer tokenizer = new StringTokenizer(line);
			while (tokenizer.hasMoreTokens()) {
				word.set(tokenizer.nextToken());
				context.write(word, one);

			}

		}
	}

	public static class Reduce extends
			Reducer<Text, IntWritable, Text, IntWritable> {
		private IntWritable result = new IntWritable();

		@Override
		public void reduce(Text key, Iterable<IntWritable> values,
				Context context) throws IOException, InterruptedException {
			int sum = 0;
			for (IntWritable val : values) {
				sum += val.get();
			}
			result.set(sum);
			context.write(key, result);
		}
	}

	public static void main(String[] args) throws Exception {
		String[] argv = { "/home/nchc/input", "/home/nchc/output-wc2", "-c",  "-skip",
				"/home/nchc/patterns/patterns.txt" };

		args = argv;
		Configuration conf = new Configuration();
		conf.set("mapred.job.tracker", "local"); // for single
		conf.set("fs.default.name", "file:///"); // for single

		if (args.length < 2) {
			System.err
					.println("Usage: hadoop jar WordCount.jar <input> <output> [-c] [-skip <path>]");
			System.exit(2);
		}

		for (int i = 0; i < args.length; ++i) {
			if ("-skip".equals(args[i])) {
				DistributedCache
						.addCacheFile(new Path(args[++i]).toUri(), conf);
				conf.setBoolean("wordcount.skip.patterns", true);
			}
			if ("-c".equals(args[i])){
				conf.setBoolean("wordcount.case.sensitive", false);
			}
		}

		CheckAndDelete.checkAndDelete(args[1], conf);
		Job job = new Job(conf, "Word Count");
		job.setJarByClass(WordCountV20.class);
		job.setMapperClass(Map.class);
		job.setCombinerClass(Reduce.class);
		job.setReducerClass(Reduce.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		FileInputFormat.addInputPath(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}

}}}