Changes between Version 1 and Version 2 of Hadoop_Lab5_1


Ignore:
Timestamp:
Sep 15, 2009, 4:47:51 PM (15 years ago)
Author:
waue
Comment:

--

Legend:

Unmodified
Added
Removed
Modified
  • Hadoop_Lab5_1

    v1 v2  
    247247 * 在這我們編輯一個範例程式 : WordCount
    248248
    249  == 3.1 mapper.java ==
    250  
    251  1. new
    252  
    253  || File ->  || new ->  || mapper ||
    254 [[Image(wiki:waue/2009/0617:file-new-mapper.png)]]
    255 
    256 -----------
    257 
    258  2. create
    259  
    260 [[Image(wiki:waue/2009/0617:3-1.png)]]
    261 {{{
    262 #!sh
    263 source folder-> 輸入: icas/src
    264 Package : Sample
    265 Name -> : mapper
    266 }}}
    267 ----------
    268 
    269  3. modify
    270  
    271 {{{
    272 #!java
    273 package Sample;
    274 
    275 import java.io.IOException;
    276 import java.util.StringTokenizer;
    277 
    278 import org.apache.hadoop.io.IntWritable;
    279 import org.apache.hadoop.io.LongWritable;
    280 import org.apache.hadoop.io.Text;
    281 import org.apache.hadoop.mapred.MapReduceBase;
    282 import org.apache.hadoop.mapred.Mapper;
    283 import org.apache.hadoop.mapred.OutputCollector;
    284 import org.apache.hadoop.mapred.Reporter;
    285 
    286 public class mapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
    287     private final static IntWritable one = new IntWritable(1);
    288     private Text word = new Text();
    289 
    290     public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
    291       String line = value.toString();
    292       StringTokenizer tokenizer = new StringTokenizer(line);
    293       while (tokenizer.hasMoreTokens()) {
    294         word.set(tokenizer.nextToken());
    295         output.collect(word, one);
    296       }
    297     }
    298   }
    299 
    300 }}}
    301 
    302 建立mapper.java後,貼入程式碼
    303 [[Image(wiki:waue/2009/0617:3-2.png)]]
    304 
    305 ------------
    306 
    307 == 3.2 reducer.java ==
    308 
    309  1. new
    310 
    311  * File -> new -> reducer
    312 [[Image(wiki:waue/2009/0617:file-new-reducer.png)]]
    313 
    314 -------
    315  2. create
    316 [[Image(wiki:waue/2009/0617:3-3.png)]]
    317 
    318 {{{
    319 #!sh
    320 source folder-> 輸入: icas/src
    321 Package : Sample
    322 Name -> : reducer
    323 }}}
    324 
    325 -----------
    326 
    327  3. modify
    328  
    329 {{{
    330 #!java
    331 package Sample;
    332 
    333 import java.io.IOException;
    334 import java.util.Iterator;
    335 
    336 import org.apache.hadoop.io.IntWritable;
    337 import org.apache.hadoop.io.Text;
    338 import org.apache.hadoop.mapred.MapReduceBase;
    339 import org.apache.hadoop.mapred.OutputCollector;
    340 import org.apache.hadoop.mapred.Reducer;
    341 import org.apache.hadoop.mapred.Reporter;
    342 
    343 public class reducer extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
    344     public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
    345       int sum = 0;
    346       while (values.hasNext()) {
    347         sum += values.next().get();
    348       }
    349       output.collect(key, new IntWritable(sum));
    350     }
    351   }
    352 }}}
    353 
    354  * File -> new -> Map/Reduce Driver
    355 [[Image(wiki:waue/2009/0617:file-new-mr-driver.png)]]
    356 ----------
    357 
    358 == 3.3 WordCount.java (main function) ==
    359 
    360  1. new
    361 
    362 建立WordCount.java,此檔用來驅動mapper 與 reducer,因此選擇 Map/Reduce Driver
    363 
    364 
    365 [[Image(wiki:waue/2009/0617:3-4.png)]]
    366 ------------
    367 
    368  2. create
    369 
    370 {{{
    371 #!sh
    372 source folder-> 輸入: icas/src
    373 Package : Sample
    374 Name -> : WordCount
    375 }}}
    376 
    377 -------
    378  3. modify
    379 
    380 {{{
    381 #!java
    382 package Sample;
    383 import org.apache.hadoop.fs.Path;
    384 import org.apache.hadoop.io.IntWritable;
    385 import org.apache.hadoop.io.Text;
    386 import org.apache.hadoop.mapred.FileInputFormat;
    387 import org.apache.hadoop.mapred.FileOutputFormat;
    388 import org.apache.hadoop.mapred.JobClient;
    389 import org.apache.hadoop.mapred.JobConf;
    390 import org.apache.hadoop.mapred.TextInputFormat;
    391 import org.apache.hadoop.mapred.TextOutputFormat;
    392 
    393 public class WordCount {
    394 
    395    public static void main(String[] args) throws Exception {
    396      JobConf conf = new JobConf(WordCount.class);
    397      conf.setJobName("wordcount");
    398 
    399      conf.setOutputKeyClass(Text.class);
    400      conf.setOutputValueClass(IntWritable.class);
    401 
    402      conf.setMapperClass(mapper.class);
    403      conf.setCombinerClass(reducer.class);
    404      conf.setReducerClass(reducer.class);
    405 
    406      conf.setInputFormat(TextInputFormat.class);
    407      conf.setOutputFormat(TextOutputFormat.class);
    408 
    409     FileInputFormat.setInputPaths(conf, new Path("/user/hadooper/input"));
    410     FileOutputFormat.setOutputPath(conf, new Path("lab5_out2"));
    411 
    412      JobClient.runJob(conf);
    413    }
    414 }
    415 }}}
    416 
    417 三個檔完成並存檔後,整個程式建立完成
    418 [[Image(wiki:waue/2009/0617:3-5.png)]]
    419 
    420 -------
    421 
    422  * 三個檔都存檔後,可以看到icas專案下的src,bin都有檔案產生,我們用指令來check
    423  
    424 {{{
    425 $ cd workspace/icas
    426 $ ls src/Sample/
    427 mapper.java  reducer.java  WordCount.java
    428 $ ls bin/Sample/
    429 mapper.class  reducer.class  WordCount.class
    430 }}}
    431249
    432250 = 四、測試範例程式 =
    433 
    434 在此提供兩種方法來run我們從eclipse 上編譯出的code。
    435 
    436 方法一是直接在eclipse上用圖形介面操作,參閱 4.1  在eclipse上操作
    437 
    438 方法二是產生jar檔後搭配自動編譯程式Makefile,參閱4.2
    439 
    440 
    441  == 4.1 法一:在eclipse上操作 ==
    442251
    443252 * 右鍵點選專案資料夾:icas -> run as -> run on Hadoop
     
    446255
    447256
    448 
     257= 五、結論 =
     258
     259 * 搭配eclipse ,我們可以更有效率的開發hadoop
     260 * hadoop 0.20 與之前的版本api以及設定都有些改變,可以看 [wiki:waue/2009/0617 hadoop 0.20 coding (eclipse )]
     261 * 有更多的時間請見 [http://trac.nchc.org.tw/cloud/wiki/Hadoop_Lab5_2 進階版]