| 1 | ◢ <[wiki:Hinet131105/Lab19 實作十九]> | <[wiki:Hinet131105 回課程大綱]> ▲ | <[wiki:Hinet131105/Lab21 實作二十一]> ◣ |
| 2 | |
| 3 | = 實作二十 Lab20 = |
| 4 | |
| 5 | {{{ |
| 6 | #!html |
| 7 | <p style="text-align: center;"><big style="font-weight: bold;"><big>預設的輸入格式<br/>TextInputFormat</big></big></p> |
| 8 | }}} |
| 9 | |
| 10 | [[PageOutline]] |
| 11 | |
| 12 | {{{ |
| 13 | #!text |
| 14 | 請先連線至 nodeN.3du.me , N 為您的報名編號 |
| 15 | }}} |
| 16 | |
| 17 | * 為了觀察 !FileInputFormat 的行為,我們使用 update jar 的技巧,對 !TextInputFormat.java 做了小幅度的修改。 |
| 18 | * 官方實作的 !TextInputFormat.java 有兩個(一個新版,一個舊版) |
| 19 | {{{ |
| 20 | user@node1:~$ find /home/user/hadoop/src/ -name "TextInputFormat.java" |
| 21 | /home/user/hadoop/src/mapred/org/apache/hadoop/mapreduce/lib/input/TextInputFormat.java |
| 22 | /home/user/hadoop/src/mapred/org/apache/hadoop/mapred/TextInputFormat.java |
| 23 | }}} |
| 24 | * 這裡我們修改新版的 !TextInputFormat.java |
| 25 | {{{ |
| 26 | #!diff |
| 27 | --- /home/user/hadoop/src/mapred/org/apache/hadoop/mapreduce/lib/input/TextInputFormat.java 2012-10-03 13:17:16.000000000 +0800 |
| 28 | +++ /home/user/hadoop_labs/lab011/src/TextInputFormat.java 2013-10-19 11:25:16.419320587 +0800 |
| 29 | @@ -38,11 +38,13 @@ |
| 30 | public RecordReader<LongWritable, Text> |
| 31 | createRecordReader(InputSplit split, |
| 32 | TaskAttemptContext context) { |
| 33 | + System.err.println("TextInputFormat.createRecordReader()"); |
| 34 | return new LineRecordReader(); |
| 35 | } |
| 36 | |
| 37 | @Override |
| 39 | protected boolean isSplitable(JobContext context, Path file) { |
| 40 | + System.err.println("TextInputFormat.isSplitable(context," + file.toString() + ")"); |
| 41 | CompressionCodec codec = |
| 42 | new CompressionCodecFactory(context.getConfiguration()).getCodec(file); |
| 43 | return codec == null; |
| 44 | }}} |
| 45 | * 讓我們先來觀察一下執行的結果 |
| 46 | {{{ |
| 47 | unset HADOOP_CONF_DIR |
| 48 | cd ~/hadoop_labs/lab011 |
| 49 | ant |
| 50 | cd ~/hadoop_labs/lab010 |
| 51 | mkdir -p my_input |
| 52 | echo "A B C D" > my_input/input1 |
| 53 | echo "C D A B" > my_input/input2 |
| 54 | hadoop fs -put my_input my_input |
| 55 | sed -i 's#setNumReduceTasks(0)#setNumReduceTasks(1)#g' ~/hadoop_labs/lab010/src/WordCount.java |
| 56 | ant |
| 57 | hadoop jar WordCount.jar my_input my_output |
| 58 | }}} |
| 59 | |
| 60 | * 接著切換到單機模式 (local mode) 再執行一次,並比較兩次執行時所顯示的訊息有何不同 |
| 61 | {{{ |
| 62 | export HADOOP_CONF_DIR=~/hadoop/conf.local/ |
| 63 | hadoop jar WordCount.jar my_input my_output |
| 64 | unset HADOOP_CONF_DIR |
| 65 | }}} |
| 66 | |
| 67 | * Reference: |
| 68 | 1. http://hadoop.apache.org/docs/r1.0.4/api/org/apache/hadoop/mapreduce/InputFormat.html |
| 69 | 2. http://hadoop.apache.org/docs/r1.0.4/api/org/apache/hadoop/mapreduce/lib/input/FileInputFormat.html |
| 70 | 3. http://hadoop.apache.org/docs/r1.0.4/api/org/apache/hadoop/mapreduce/lib/input/TextInputFormat.html |
| 71 | |
| 72 | == 實作習題 == |
| 73 | |
| 74 | <問題 1> 當運行於全分散式模式,請問執行任務時,您觀察到幾行 "!TextInputFormat.isSplitable" |
| 75 | |
| 76 | <問題 2> 當運行於全分散式模式,請問執行任務時,您觀察到幾行 "!TextInputFormat.createRecordReader" |
| 77 | |
| 78 | <問題 3> 當運行於單機模式,請問執行任務時,您觀察到幾行 "!TextInputFormat.isSplitable" |
| 79 | |
| 80 | <問題 4> 當運行於單機模式,請問執行任務時,您觀察到幾行 "!TextInputFormat.createRecordReader" |