Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

DemoWordCondProb.java @ 57

Last change on this file since 57 was 20, checked in by waue, 16 years ago
將改完的 hadoop 0.17版package 放來備份目前繼續開發 hadoop 0.16 + hbase 1.3
File size: 6.8 KB

Rev	Line
[20]	1	/**
	2	* Program: DemoWordCondProb.java
	3	* Editor: Waue Chen
	4	* From : NCHC. Taiwan
	5	* Last Update Date: 07/02/2008
	6	* Re-code from : Cloud9: A MapReduce Library for Hadoop
	7	*/
	8	/*
	9	* Cloud9: A MapReduce Library for Hadoop
	10	*/
	11
	12	package tw.org.nchc.demo;
	13
	14	import java.io.IOException;
	15	import java.rmi.UnexpectedException;
	16	import java.util.HashMap;
	17	import java.util.Iterator;
	18	import java.util.StringTokenizer;
	19
	20	import org.apache.hadoop.fs.Path;
	21	import org.apache.hadoop.io.FloatWritable;
	22	import org.apache.hadoop.io.LongWritable;
	23	import org.apache.hadoop.mapred.JobClient;
	24	import org.apache.hadoop.mapred.JobConf;
	25	import org.apache.hadoop.mapred.MapReduceBase;
	26	import org.apache.hadoop.mapred.Mapper;
	27	import org.apache.hadoop.mapred.OutputCollector;
	28	import org.apache.hadoop.mapred.Partitioner;
	29	import org.apache.hadoop.mapred.Reducer;
	30	import org.apache.hadoop.mapred.Reporter;
	31	import org.apache.hadoop.mapred.SequenceFileInputFormat;
	32	import org.apache.hadoop.mapred.TextOutputFormat;
	33	import org.apache.hadoop.mapred.lib.IdentityReducer;
	34
	35	import tw.org.nchc.code.Convert;
	36	import tw.org.nchc.tuple.Schema;
	37	import tw.org.nchc.tuple.Tuple;
	38
	39	/**
	40	* <p>
	41	* Demo that illustrates the use of a Partitioner and special symbols in Tuple
	42	* to compute conditional probabilities. Demo builds on
	43	* {@link DemoWordCountTuple}, and has similar structure. Input comes from
	44	* Bible+Shakespeare sample collection, encoded as single-field tuples; see
	45	* {@link DemoPackRecords}. Sample of final output:
	46	*
	47	* <pre>
	48	* ...
	49	* (admirable, *) 15.0
	50	* (admirable, 0) 0.6
	51	* (admirable, 1) 0.4
	52	* (admiral, *) 6.0
	53	* (admiral, 0) 0.33333334
	54	* (admiral, 1) 0.6666667
	55	* (admiration, *) 16.0
	56	* (admiration, 0) 0.625
	57	* (admiration, 1) 0.375
	58	* (admire, *) 8.0
	59	* (admire, 0) 0.625
	60	* (admire, 1) 0.375
	61	* (admired, *) 19.0
	62	* (admired, 0) 0.6315789
	63	* (admired, 1) 0.36842105
	64	* ...
	65	* </pre>
	66	*
	67	* <p>
	68	* The first field of the key tuple contains a token. If the second field
	69	* contains the special symbol '*', then the value indicates the count of the
	70	* token in the collection. Otherwise, the value indicates p(EvenOrOdd|Token),
	71	* the probability that a line is odd-length or even-length, given the
	72	* occurrence of a token.
	73	* </p>
	74	*/
	75	public class DemoWordCondProb {
	76
	77	// create the schema for the tuple that will serve as the key
	78	private static final Schema KEY_SCHEMA = new Schema();
	79
	80	// define the schema statically
	81	static {
	82	KEY_SCHEMA.addField("Token", String.class, "");
	83	KEY_SCHEMA.addField("EvenOrOdd", Integer.class, new Integer(1));
	84	}
	85
	86	// mapper that emits tuple as the key, and value '1' for each occurrence
	87	private static class MapClass extends MapReduceBase implements
	88	Mapper<LongWritable, Tuple, Tuple, FloatWritable> {
	89	private final static FloatWritable one = new FloatWritable(1);
	90	private Tuple tupleOut = KEY_SCHEMA.instantiate();
	91
	92	public void map(LongWritable key, Tuple tupleIn,
	93	OutputCollector<Tuple, FloatWritable> output, Reporter reporter)
	94	throws IOException {
	95
	96	// the input value is a tuple; get field 0
	97	// see DemoPackRecords of how input SequenceFile is generated
	98	String line = (String) ((Tuple) tupleIn).get(0);
	99	StringTokenizer itr = new StringTokenizer(line);
	100	while (itr.hasMoreTokens()) {
	101	String token = itr.nextToken();
	102
	103	// emit key-value pair for either even-length or odd-length line
	104	tupleOut.set("Token", token);
	105	tupleOut.set("EvenOrOdd", line.length() % 2);
	106	output.collect(tupleOut, one);
	107
	108	// emit key-value pair for the total count
	109	tupleOut.set("Token", token);
	110	// use special symbol in field 2
	111	tupleOut.setSymbol("EvenOrOdd", "*");
	112	output.collect(tupleOut, one);
	113	}
	114	}
	115	}
	116
	117	// reducer computes conditional probabilities
	118	private static class ReduceClass extends MapReduceBase implements
	119	Reducer<Tuple, FloatWritable, Tuple, FloatWritable> {
	120	// HashMap keeps track of total counts
	121	private final static HashMap<String, Integer> TotalCounts = new HashMap<String, Integer>();
	122
	123	public synchronized void reduce(Tuple tupleKey,
	124	Iterator<FloatWritable> values,
	125	OutputCollector<Tuple, FloatWritable> output, Reporter reporter)
	126	throws IOException {
	127	// sum values
	128	int sum = 0;
	129	while (values.hasNext()) {
	130	sum += values.next().get();
	131	}
	132
	133	String tok = (String) tupleKey.get("Token");
	134
	135	// check if the second field is a special symbol
	136	if (tupleKey.containsSymbol("EvenOrOdd")) {
	137	// emit total count
	138	output.collect(tupleKey, new FloatWritable(sum));
	139	// record total count
	140	TotalCounts.put(tok, sum);
	141	} else {
	142	if (!TotalCounts.containsKey(tok))
	143	throw new UnexpectedException("Don't have total counts!");
	144
	145	// divide sum by total count to obtain conditional probability
	146	float p = (float) sum / TotalCounts.get(tok);
	147
	148	// emit P(EvenOrOdd\|Token)
	149	output.collect(tupleKey, new FloatWritable(p));
	150	}
	151	}
	152	}
	153
	154	// partition by first field of the tuple, so that tuples corresponding
	155	// to the same token will be sent to the same reducer
	156	private static class MyPartitioner implements
	157	Partitioner<Tuple, FloatWritable> {
	158	public void configure(JobConf job) {
	159	}
	160
	161	public int getPartition(Tuple key, FloatWritable value,
	162	int numReduceTasks) {
	163	return (key.get("Token").hashCode() & Integer.MAX_VALUE)
	164	% numReduceTasks;
	165	}
	166	}
	167
	168	// dummy constructor
	169	private DemoWordCondProb() {
	170	}
	171
	172	/**
	173	* Runs the demo.
	174	*/
	175	public static void main(String[] args) throws IOException {
	176	String inPath = "/shared/sample-input/bible+shakes.nopunc.packed";
	177	String output1Path = "condprob";
	178	int numMapTasks = 20;
	179	int numReduceTasks = 10;
	180
	181	// first MapReduce cycle is to do the tuple counting
	182	JobConf conf1 = new JobConf(DemoWordCondProb.class);
	183	conf1.setJobName("DemoWordCondProb.MR1");
	184
	185	conf1.setNumMapTasks(numMapTasks);
	186	conf1.setNumReduceTasks(numReduceTasks);
	187	//0.16
	188	// conf1.setInputPath(new Path(inPath));
	189	Convert.setInputPath(conf1, new Path(inPath));
	190
	191	conf1.setInputFormat(SequenceFileInputFormat.class);
	192
	193	// 0.16
	194	// conf1.setOutputPath(new Path(output1Path));
	195	Convert.setOutputPath(conf1,new Path(output1Path));
	196	conf1.setOutputKeyClass(Tuple.class);
	197	conf1.setOutputValueClass(FloatWritable.class);
	198	conf1.setOutputFormat(TextOutputFormat.class);
	199
	200	conf1.setMapperClass(MapClass.class);
	201	// this is a potential gotcha! can't use ReduceClass for combine because
	202	// we have not collected all the counts yet, so we can't divide through
	203	// to compute the conditional probabilities
	204	conf1.setCombinerClass(IdentityReducer.class);
	205	conf1.setReducerClass(ReduceClass.class);
	206	conf1.setPartitionerClass(MyPartitioner.class);
	207
	208	JobClient.runJob(conf1);
	209	}
	210	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: sample/hadoop-0.17/tw/org/nchc/demo/DemoWordCondProb.java @ 57

Download in other formats: