Context Navigation

0118

Timestamp: Jan 19, 2010, 11:29:36 AM (15 years ago)
Author: waue
Comment: --

Legend:

: Unmodified
: Added
: Removed
: Modified

waue/2010/0118

-                      v5
+                      v6
 {{{
 #!java
+        public static class wordindexM extends
+                        Mapper<LongWritable, Text, Text, Text> {
+                public void map(LongWritable key, Text value,
+                                OutputCollector<Text, Text> output, Reporter reporter)
+                                throws IOException {
+import java.io.IOException;
+import java.util.StringTokenizer;
                         FileSplit fileSplit = (FileSplit) reporter.getInputSplit();
                         String line = value.toString();
                         StringTokenizer st = new StringTokenizer(line.toLowerCase());
+                        while (st.hasMoreTokens()) {
                                 String word = st.nextToken();
+                                output.collect(new Text(word), new Text(fileSplit.getPath()
+                                                .getName()
                                                 + ":" + line));
+                        }
+                }
+        }
+}}}
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.FileSplit;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.util.GenericOptionsParser;
+遇到問題：
+{{{
+#!text
+/01/18 20:52:39 INFO input.FileInputFormat: Total input paths to process : 2
+/01/18 20:52:39 INFO mapred.JobClient: Running job: job_201001181452_0038
+/01/18 20:52:40 INFO mapred.JobClient:  map 0% reduce 0%
+/01/18 20:52:50 INFO mapred.JobClient: Task Id : attempt_201001181452_0038_m_000000_0, Status : FAILED
+java.io.IOException: Type mismatch in key from map: expected org.apache.hadoop.io.Text, recieved org.apache.hadoop.io.LongWritable
+}}}
+ * 已解決
+{{{
+#!java
+        public static class wordindexM extends
+public class WordIndex {
+         public static class wordindexM extends
                         Mapper<LongWritable, Text, Text, Text> {
                 public void map(LongWritable key, Text value,
 …
                                 map_key.set(word);
                                 map_value.set(fileSplit.getPath().getName() + ":" + line);
                                 output.collect(map_key,map_value);
+                                output.collect(map_key, map_value);
+                        }
+                }
+      }
+        }
+        static public class wordindexR extends Reducer<Text, Text, Text, Text> {
+                public void reduce(Text key, Iterable<Text> values,
+                                OutputCollector<Text, Text> output, Reporter reporter)
+                                throws IOException {
+                        String v = "";
+                        StringBuilder ret = new StringBuilder("\n");
+                        for (Text val : values) {
+                                v += val.toString().trim();
+                                if (v.length() > 0)
+                                        ret.append(v + "\n");
+                        }
+                        output.collect((Text) key, new Text(ret.toString()));
+                }
+        }
+        public static void main(String[] args) throws IOException,
+                        InterruptedException, ClassNotFoundException {
+                // debug using
+                String[] argv = { "/user/waue/input", "/user/waue/output-wi" };
+                args = argv;
+                Configuration conf = new Configuration();
+                String[] otherArgs = new GenericOptionsParser(conf, args)
+                                .getRemainingArgs();
+                if (otherArgs.length < 2) {
+                        System.out.println("hadoop jar WordIndex.jar <inDir> <outDir>");
+                        return;
+                }
+                Job job = new Job(conf, "word index");
+                job.setJobName("word inverted index");
+                job.setJarByClass(WordIndex.class);
+                job.setMapOutputKeyClass(Text.class);
+                job.setMapOutputValueClass(Text.class);
+                job.setOutputKeyClass(Text.class);
+                job.setOutputValueClass(Text.class);
+                job.setMapperClass(wordindexM.class);
+                job.setReducerClass(wordindexR.class);
+                job.setCombinerClass(wordindexR.class);
+                FileInputFormat.setInputPaths(job, args[0]);
+                FileOutputFormat.setOutputPath(job, new Path(args[1]));
+                long start = System.nanoTime();
+                job.waitForCompletion(true);
+                long time = System.nanoTime() - start;
+                System.err.println(time * (1E-9) + " secs.");
+        }
+}
 }}}
+ * 解析
+遇到問題：
+{{{
+#!text
+/01/18 20:52:39 INFO input.FileInputFormat: Total input paths to process : 2
+/01/18 20:52:39 INFO mapred.JobClient: Running job: job_201001181452_0038
+/01/18 20:52:40 INFO mapred.JobClient:  map 0% reduce 0%
+/01/18 20:52:50 INFO mapred.JobClient: Task Id : attempt_201001181452_0038_m_000000_0, Status : FAILED
+java.io.IOException: Type mismatch in key from map: expected org.apache.hadoop.io.Text, recieved org.apache.hadoop.io.LongWritable
+用output.collect寫入輸出串流，''' new Text(word) ''' 感覺用這個方法就可以把 Text 型態的資料寫入，
+}}}
+hadoop 0.18 可以這麼做不會出問題，但在 hadoop 0.20 之後，如果遇到''' " Type mismatch in key from xxx " ''' 問題時
+ * 目前認為是
+可以換成用 Text.set() 方法來解決問題 ！
+{{{
+#!java
+         public static class wordindexM extends
+                        Mapper<LongWritable, Text, Text, Text> {
+                public void map(LongWritable key, Text value,
+                                OutputCollector<Text, Text> output, Reporter reporter)
+                                throws IOException {
+}}}
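+使用新的 org.apache.hadoop.mapreduce.Mapper 可以省略 extends ... implements ... 的宣告，
+不過它預設搭配的 map 實作方法是 ''' "public void map(LongWritable key, Text value, Context context)" ''' 而非
+''' "public void map(LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter)" '''。
+因此上面以 OutputCollector / Reporter 為參數的 map 並沒有覆寫 Mapper.map，框架會改為執行預設的 map（直接把輸入的 LongWritable key 原樣輸出），才會出現 "Type mismatch in key from map" 的錯誤。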
+ * 解決辦法