| 1 | |
| 2 | = hadoop streaming + perl = |
| 3 | |
| 4 | 僅看到,未測 |
| 5 | |
| 6 | 注意 |
| 7 | 目前 streaming 不支持 linux pipe(也就是 cat | wc -l 這樣的管道),但這不妨礙我們使用 perl、python 的單行命令!!
| 8 | 原話是 : |
| 9 | |
| 10 | Can I use UNIX pipes? For example, will -mapper "cut -f1 | sed s/foo/bar/g" work? |
| 11 | Currently this does not work and gives an "java.io.IOException: Broken pipe" error. |
| 12 | This is probably a bug that needs to be investigated. |
| 13 | 但如果你是強烈的 linux shell pipe 發燒友 ! 參考下面 |
| 14 | |
| 15 | {{{ |
| 16 | $> perl -e 'open( my $fh, "grep -v null tt |sed -n 1,5p |");while ( <$fh> ) {print;} ' |
| 17 | #不過我沒測試通過 !! |
| 18 | }}} |
| 19 | 環境 :hadoop-0.18.3 |
| 20 | |
| 21 | {{{ |
| 22 | $> find . -type f -name "*streaming*.jar" |
| 23 | ./contrib/streaming/hadoop-0.18.3-streaming.jar |
| 24 | }}} |
| 25 | |
| 26 | 測試數據: |
| 27 | |
| 28 | {{{ |
| 29 | $ head tt |
| 30 | null false 3702 208100 |
| 31 | 6005100 false 70 13220 |
| 32 | 6005127 false 24 4640 |
| 33 | 6005160 false 25 4820 |
| 34 | 6005161 false 20 3620 |
| 35 | 6005164 false 14 1280 |
| 36 | 6005165 false 37 7080 |
| 37 | 6005168 false 104 20140 |
| 38 | 6005169 false 35 6680 |
| 39 | 6005240 false 169 32140 |
| 40 | ...... |
| 41 | }}} |
| 42 | |
| 43 | 運行: |
| 44 | {{{ |
| 45 | #!sh
| 46 | |
| 47 | c1=" perl -ne 'if(/.*\t(.*)/){\$sum+=\$1;}END{print \"\$sum\";}' " |
| 48 | # 注意 這裡 $ 要寫成 \$ " 寫成 \" |
| 49 | |
| 50 | echo $c1; # 打印輸出 perl -ne 'if(/.*\t(.*)/){$sum+=$1;}END{print $sum;}'
| 51 | hadoop jar hadoop-0.18.3-streaming.jar \
| 52 | -input file:///data/hadoop/lky/jar/tt \
| 53 | -mapper "/bin/cat" \
| 54 | -reducer "$c1" \
| 55 | -output file:///tmp/lky/streamingx8
| 56 | }}} |
| 57 | |
| 58 | 結果: |
| 59 | {{{ |
| 60 | cat /tmp/lky/streamingx8/* |
| 61 | 1166480 |
| 62 | }}} |
| 63 | |
| 64 | 本地運行輸出: |
| 65 | |
| 66 | {{{ |
| 67 | perl -ne 'if(/.*\t(.*)/){$sum+=$1;}END{print $sum;}' < tt
| 68 | 1166480 |
| 69 | }}} |
| 70 | |
| 71 | 結果正確!!!! |
| 72 | |
| 73 | |
| 74 | 命令自帶文檔: |
| 75 | {{{ |
| 76 | -bash-3.00$ hadoop jar hadoop-0.18.3-streaming.jar -info |
| 77 | 09/09/25 14:50:12 ERROR streaming.StreamJob: Missing required option -input |
| 78 | Usage: $HADOOP_HOME/bin/hadoop [--config dir] jar \ |
| 79 | $HADOOP_HOME/hadoop-streaming.jar [options] |
| 80 | Options: |
| 81 | -input <path> DFS input file(s) for the Map step |
| 82 | -output <path> DFS output directory for the Reduce step |
| 83 | -mapper <cmd|JavaClassName> The streaming command to run |
| 84 | -combiner <JavaClassName> Combiner has to be a Java class |
| 85 | -reducer <cmd|JavaClassName> The streaming command to run |
| 86 | -file <file> File/dir to be shipped in the Job jar file |
| 87 | -dfs <h:p>|local Optional. Override DFS configuration |
| 88 | -jt <h:p>|local Optional. Override JobTracker configuration |
| 89 | -additionalconfspec specfile Optional. |
| 90 | -inputformat TextInputFormat(default)|SequenceFileAsTextInputFormat|JavaClassName Optional. |
| 91 | -outputformat TextOutputFormat(default)|JavaClassName Optional. |
| 92 | -partitioner JavaClassName Optional. |
| 93 | -numReduceTasks <num> Optional. |
| 94 | -inputreader <spec> Optional. |
| 95 | -jobconf <n>=<v> Optional. Add or override a JobConf property |
| 96 | -cmdenv <n>=<v> Optional. Pass env.var to streaming commands |
| 97 | -mapdebug <path> Optional. To run this script when a map task fails |
| 98 | -reducedebug <path> Optional. To run this script when a reduce task fails |
| 99 | -cacheFile fileNameURI |
| 100 | -cacheArchive fileNameURI |
| 101 | -verbose |
| 102 | }}} |
| 103 | |
| 104 | 參考: |