[[PageOutline]]

= '''Developing !MapReduce Programs with Eclipse''' =

= 1. Preparation =
 * System:
   * Ubuntu 8.10
   * Hadoop 0.18.3
     * download and installation are covered in section 2.2

 * Development tools:
   * Eclipse 3.2.2

|| || Command || Notes ||
|| $ || sudo apt-get install eclipse || Install Eclipse ||

 * Java 6

|| || Command || Notes ||
|| $ || sudo apt-get purge java-gcj-compat || For licensing reasons Ubuntu ships GCJ, a Java work-alike, by default; remove it first ||
|| $ || sudo apt-get install sun-java6-bin sun-java6-jdk sun-java6-jre sun-java6-plugin || Install Sun's Java ||

 * Set the environment variables
   * Open /etc/bash.bashrc (sudo gedit /etc/bash.bashrc) and append the following lines at the end of the file:
{{{
export JAVA_HOME=/usr/lib/jvm/java-6-sun
export HADOOP_HOME=/opt/hadoop/
export CLASSPATH=.:$JAVA_HOME/lib/dt.jar:$JAVA_HOME/lib/tools.jar
}}}
 * With the settings above, the environment variables are:

|| Name || Path ||
|| Hadoop Home || /opt/hadoop/ ||
|| Java Home || /usr/lib/jvm/java-6-sun ||

= 2. Installing and Configuring Hadoop =
Hadoop is a free-software project maintained by Apache, and HDFS (Hadoop Distributed File System) is the file system that comes with it. We therefore install and configure the Hadoop project, but when operating on the running system, particularly its file system, we refer to it as HDFS. In other words, we install and configure Hadoop inside Linux, and once it is running it provides a new distributed file system, HDFS, layered on top of Linux. Being distributed, it can span a cluster of many Linux machines and pool their disk space. Since the goal of this article is to write !MapReduce programs, the configuration demonstrated here runs HDFS on a single local machine only.

== 2.1 Generate SSH Keys for the User ==

|| || Command || Notes ||
||$|| ssh-keygen -t rsa -P "" || Generate a passphrase-less SSH key ||
||$|| cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys || Add the key to the authorized list ||
||$|| ssh localhost || Verify that login no longer asks for a password ||
||$|| exit || Log out when done ||

== 2.2 Install Hadoop ==

|| || Command || Notes ||
|| $ || wget http://ftp.twaren.net/Unix/Web/apache/hadoop/core/hadoop-0.18.3/hadoop-0.18.3.tar.gz || Download the release ||
|| $ || sudo tar -zxvf hadoop-0.18.3.tar.gz -C /opt/ || Unpack it ||
|| $ || sudo chown -R waue:waue /opt/hadoop-0.18.3 || Change the owner (replace waue with your username) ||
|| $ || sudo ln -sf /opt/hadoop-0.18.3 /opt/hadoop || Create a symbolic link ||
|| $ || cd /opt/hadoop || ||

== 2.3 Configuration ==
 * Edit hadoop-env.sh ($ gedit /opt/hadoop/conf/hadoop-env.sh)
   * Apply the following changes:
{{{
#!diff
--- /opt/hadoop/conf/hadoop-env.sh.bek
+++ /opt/hadoop/conf/hadoop-env.sh
@@ -8,9 +8,12 @@
 # The java implementation to use. Required.
-# export JAVA_HOME=/usr/lib/j2sdk1.5-sun
+export JAVA_HOME=/usr/lib/jvm/java-6-sun
+export HADOOP_HOME=/opt/hadoop
+export HADOOP_LOG_DIR=$HADOOP_HOME/logs
+export HADOOP_SLAVES=$HADOOP_HOME/conf/slaves
}}}

 * Edit hadoop-site.xml ($ gedit $HADOOP_HOME/conf/hadoop-site.xml)
   * Paste in the whole block below:
{{{
#!xml
<configuration>
  <property>
    <name>fs.default.name</name>
    <value>hdfs://localhost:9000/</value>
    <description>
    </description>
  </property>
  <property>
    <name>mapred.job.tracker</name>
    <value>hdfs://localhost:9001/</value>
    <description>
    </description>
  </property>
  <property>
    <name>mapred.map.tasks</name>
    <value>1</value>
    <description>
      define mapred.map tasks to be number of slave hosts
    </description>
  </property>
  <property>
    <name>mapred.reduce.tasks</name>
    <value>1</value>
    <description>
      define mapred.reduce tasks to be number of slave hosts
    </description>
  </property>
  <property>
    <name>dfs.replication</name>
    <value>1</value>
  </property>
</configuration>
}}}
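
A quick way to confirm that this configuration is what client code actually sees is the minimal Java sketch below. It is only an illustration and not part of the original tutorial: it assumes hadoop-site.xml is on the classpath (for example when run from a project configured against /opt/hadoop) and that HDFS has already been started (section 2.4 below); the class name ''!CheckHdfs'' is made up for this example.
{{{
#!java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class CheckHdfs {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();   // loads hadoop-site.xml from the classpath
    FileSystem fs = FileSystem.get(conf);       // should resolve to hdfs://localhost:9000/
    System.out.println("Default file system: " + fs.getUri());
    for (FileStatus status : fs.listStatus(new Path("/"))) {
      System.out.println(status.getPath());     // list the HDFS root directory
    }
  }
}
}}}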

== 2.4 Starting HDFS ==
 * Format HDFS (Hadoop Distributed File System)

|| || Command ||
|| $ || cd $HADOOP_HOME ||
|| $ || bin/hadoop namenode -format ||
 * Expected output:
{{{
09/02/03 18:08:59 INFO dfs.NameNode: STARTUP_MSG:
/************************************************************
STARTUP_MSG: Starting NameNode
STARTUP_MSG: host = vPro/140.110.138.193
STARTUP_MSG: args = [-format]
STARTUP_MSG: version = 0.18.3
STARTUP_MSG: build = https://svn.apache.org/repos/asf/hadoop/core/branches/branch-0.18 -r 736250; compiled by 'ndaley' on Thu Jan 22 23:12:08 UTC 2009
************************************************************/
09/02/03 18:08:59 INFO fs.FSNamesystem: fsOwner=waue,waue,adm,dialout,cdrom,floppy,audio,dip,video,plugdev,fuse,lpadmin,admin,sambashare
09/02/03 18:08:59 INFO fs.FSNamesystem: supergroup=supergroup
09/02/03 18:08:59 INFO fs.FSNamesystem: isPermissionEnabled=true
09/02/03 18:08:59 INFO dfs.Storage: Image file of size 78 saved in 0 seconds.
09/02/03 18:08:59 INFO dfs.Storage: Storage directory /tmp/hadoop-waue/dfs/name has been successfully formatted.
09/02/03 18:08:59 INFO dfs.NameNode: SHUTDOWN_MSG:
/************************************************************
SHUTDOWN_MSG: Shutting down NameNode at vPro/140.110.138.193
************************************************************/
}}}
 * Start Hadoop (HDFS and the !MapReduce daemons)

|| || Command ||
|| $ || bin/start-all.sh ||
 * Expected output:
{{{
starting namenode, logging to /opt/hadoop/logs/hadoop-waue-namenode-vPro.out
localhost: starting datanode, logging to /opt/hadoop/logs/hadoop-waue-datanode-vPro.out
localhost: starting secondarynamenode, logging to /opt/hadoop/logs/hadoop-waue-secondarynamenode-vPro.out
starting jobtracker, logging to /opt/hadoop/logs/hadoop-waue-jobtracker-vPro.out
localhost: starting tasktracker, logging to /opt/hadoop/logs/hadoop-waue-tasktracker-vPro.out
}}}

 * Open the following three URLs in a browser; if each of them shows a page, the daemons started successfully

|| URL || Description ||
|| http://localhost:50030 || Map/Reduce Administration ||
|| http://localhost:50060 || Task Tracker Status ||
|| http://localhost:50070 || !NameNode ||
== 2.5 Troubleshooting ==
 * If errors occur while starting up, stop Hadoop, remove all of the intermediate data Hadoop has produced, and start over

|| || Command ||
|| $ || cd $HADOOP_HOME ||
|| $ || bin/stop-all.sh ||
|| $ || rm -rf /tmp/* ||
|| || Re-check the settings in 2.3, then repeat the steps in 2.4 ||

= 3. Installing and Using the Hadoop Eclipse Plugin =
Eclipse is a well-known IDE for Java development. Although Hadoop itself is written in Java, getting Eclipse to recognize the Hadoop API, compile the code, and run the resulting programs requires a lot of fiddly, tedious setup. IBM released an Eclipse plugin, IBM !MapReduce Tools for Eclipse, which lets developers create !MapReduce applications from within Eclipse. Below we show how to install this Hadoop eclipse-plugin.

== 3.1 Eclipse 3.3 and Later (has a bug) ==
 * Install the IBM !MapReduce tool

|| || Command ||
|| $ || sudo cp /opt/hadoop-0.18.3/contrib/eclipse-plugin/hadoop-0.18.3-eclipse-plugin.jar /usr/lib/eclipse/plugins/ ||

Note: the method above requires Eclipse 3.3 or later together with Hadoop 0.17 or later.

 * In the desktop menu, right-click the Eclipse launcher icon and add the following arguments to its command line to make it more stable
{{{
/usr/bin/eclipse -vmargs -Xmx512M
}}}

 * Start Eclipse and work through the GUI:

|| GUI action || Settings || Notes ||
|| '''File''' > '''New > Project''' || the '''!MapReduce category''' appears || Confirms that the IBM !MapReduce tool was installed successfully ||
|| '''Window''' > '''Preferences''' > '''Java''' > '''Compiler''' || set '''compiler compliance level''' to 1.6 || Raise the Java compiler level to 1.6 or later, otherwise you will get a flood of errors ||

 * Create a Hadoop project

|| GUI action || Settings || Notes ||
|| '''File''' > '''New''' > '''Map/Reduce Project''' > '''Next''' || '''Project name''': ''sample'' [[br]] '''Configure Hadoop install directory''' => /opt/hadoop || Set the project name and the Hadoop home directory ||
 * When this is done, the ''sample'' project appears in the Project Explorer

 * Connect Eclipse to HDFS

|| GUI action || Settings || Notes ||
|| '''Window''' > '''Show View''' > '''Other...''' > '''!MapReduce Tools''' || '''!MapReduce locations''' || Opens the !MapReduce view; afterwards a blue elephant icon appears in the lower-right panel ||
|| Click the blue elephant icon at the lower right || '''Location name''': ''test'' [[br]] '''Host''': ''localhost'' [[br]] M/R Master: '''Port''': ''9001'' [[br]] DFS Master: '''Use M/R Master host''': ''V'' [[br]] '''Port''': ''9000'' || These settings correspond to the hadoop-site.xml file ||

 * If mapred.job.tracker were set to hdfs://ubuntu:9010/, the M/R Master settings would be '''host: ubuntu''', '''Port: 9010'''; the DFS Master settings likewise correspond to fs.default.name
 * Note that HDFS must already be running, otherwise the Finish button cannot be clicked
 * When this is done, '''DFS Locations''' appears in the Project Explorer

 * As of 2009/2/5, the latest Eclipse 3.4 combined with the 0.18.3 or 0.19.0 hadoop plugin still fails at "run as" => "run on hadoop" (the dialog for choosing a server location never pops up). The problem was reported on 2008/7/31 as [http://webui.sourcelabs.com/hadoop/issues/3744 (HADOOP-3744) Eclipse Plugin does not work with Eclipse Ganymede (3.4)] and remains unresolved.

== 3.2 Eclipse 3.2 and Earlier ==
 * Install the IBM !MapReduce tool

For Eclipse 3.2 and earlier you must download mapReduce_tools.zip from IBM's official website, unzip it, and copy the resulting folder into $eclipse/plugins/ before the tool will work.

 * In the desktop menu, right-click the Eclipse launcher icon and add the following arguments to its command line to make it more stable
{{{
/usr/bin/eclipse -vmargs -Xmx512m
}}}

 * Start Eclipse and work through the GUI:

|| GUI action || Settings || Notes ||
|| '''File''' > '''New > Project''' || the '''!MapReduce category''' appears || Confirms that the IBM !MapReduce tool was installed successfully ||
|| '''Window''' > '''Preferences''' > '''Java''' > '''Compiler''' || set '''compiler compliance level''' to 5.0 || Raise the Java compiler level to 5.0 or later, otherwise you will get a flood of errors ||

 * Create a Hadoop project

|| GUI action || Settings || Notes ||
|| '''File''' > '''New''' > '''Project''' > '''Map-Reduce Project''' > '''Next''' || '''Project name''': ''sample'' [[br]] '''Use default location''': V [[br]] '''Use default Hadoop''': V [[br]] > '''Finish''' || Accept the project defaults ||

 * Connect Eclipse to HDFS

|| GUI action || Settings || Notes ||
|| '''Window''' > '''Show View''' > '''Other...''' > '''!MapReduce Tools''' > '''!MapReduce Servers''' || '''!MapReduce Servers''' || Opens the !MapReduce view; afterwards a blue elephant icon appears in the lower-right panel ||
|| Click the blue elephant icon || '''Server name''': ''any_you_want'' [[br]] '''Hostname''': ''localhost'' [[br]] '''Installation directory''': ''/opt/hadoop/'' [[br]] '''Username''': ''waue'' || Registers the Hadoop server; if any password prompt appears, enter the password of your Linux user account ||

= 4. Compiling a !MapReduce Example with Eclipse =
In this example we first upload a plain-text file to HDFS. Its file extension does not matter as long as the content is plain text; simply uploading the README file works, but to show off what !MapReduce can do, the larger the file you give Hadoop to chew on, the better. We then demonstrate how to write the !MapReduce program in Eclipse and compile it.


'''1. Upload a text file for the !WordCount program to use as its word-count input'''
|| || Command ||
|| $ || cd /opt/hadoop/ ||
|| $ || wget xxx/132.txt ||
|| $ || bin/hadoop dfs -mkdir input ||
|| $ || bin/hadoop dfs -put 132.txt input ||
|| $ || bin/hadoop dfs -ls ||
{{{
Found 1 items
/user/waue/input <dir> 2008-05-23 15:15 rwxr-xr-x waue supergroup
}}}
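
The dfs commands above are all the tutorial requires; purely as an illustration, the same upload can also be done from Java through the HDFS !FileSystem API. In the sketch below the class name ''!PutFile'' and the local path /opt/hadoop/132.txt are assumptions made for this example.
{{{
#!java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class PutFile {
  public static void main(String[] args) throws Exception {
    FileSystem fs = FileSystem.get(new Configuration());
    fs.mkdirs(new Path("input"));                          // like: bin/hadoop dfs -mkdir input
    fs.copyFromLocalFile(new Path("/opt/hadoop/132.txt"),  // local source file
                         new Path("input/132.txt"));       // HDFS destination
  }
}
}}}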


'''2. Create the source file in Eclipse'''
In the '''Project Explorer''' on the left-hand side of Eclipse you can see the ''sample'' project configured earlier. Add a new source file to it:

|| GUI action || Settings || Notes ||
|| In the '''Project Explorer''', right-click '''sample''' > '''new''' > '''file''' || '''file name''': !WordCount.java || Creates a new !WordCount.java file ||

'''3. Write the program'''
Paste in the contents of [http://trac.nchc.org.tw/cloud/attachment/wiki/hadoop-sample-code/WordCount.java?format=raw WordCount.java].
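
If that link is unreachable, the block below is a minimal !WordCount sketch written against the Hadoop 0.18 '''mapred''' API that conveys the same idea; the HDFS paths ''input'' and ''output'' used here are assumptions (''input'' matches the upload step above), so prefer the original WordCount.java from the link whenever it is available.
{{{
#!java
import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

public class WordCount {

  // Mapper: emit (word, 1) for every token of every input line.
  public static class Map extends MapReduceBase
      implements Mapper<LongWritable, Text, Text, IntWritable> {
    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(LongWritable key, Text value,
                    OutputCollector<Text, IntWritable> output, Reporter reporter)
        throws IOException {
      StringTokenizer itr = new StringTokenizer(value.toString());
      while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());
        output.collect(word, one);
      }
    }
  }

  // Reducer: sum up the counts collected for each word.
  public static class Reduce extends MapReduceBase
      implements Reducer<Text, IntWritable, Text, IntWritable> {
    public void reduce(Text key, Iterator<IntWritable> values,
                       OutputCollector<Text, IntWritable> output, Reporter reporter)
        throws IOException {
      int sum = 0;
      while (values.hasNext()) {
        sum += values.next().get();
      }
      output.collect(key, new IntWritable(sum));
    }
  }

  public static void main(String[] args) throws IOException {
    JobConf conf = new JobConf(WordCount.class);
    conf.setJobName("wordcount");
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);
    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);
    // "input" and "output" are HDFS paths; "input" was created by the upload step above.
    FileInputFormat.setInputPaths(conf, new Path("input"));
    FileOutputFormat.setOutputPath(conf, new Path("output"));
    JobClient.runJob(conf);
  }
}
}}}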

'''4. Run it'''

|| GUI action || Settings || Notes ||
|| In the '''Project Explorer''', right-click '''!WordCount.java''' > '''run as ...''' > '''run on hadoop''' > '''choose an existing server from the list below''' || '''finish''' || Runs the !MapReduce job ||

'''5. The run's output appears in the console window at the lower right'''

'''6. Hadoop stores its results in files inside HDFS. To inspect them, you have three options'''
 * Use a browser: open http://localhost:50070 and browse the file directory
 * Use HDFS commands to print the result files directly (a programmatic equivalent is sketched below)
 * Copy the output from HDFS to a local folder and inspect it there
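
As an illustration of the second option, the sketch below prints the job's result file from HDFS to the console. It assumes the job wrote to an ''output'' directory containing a single reduce output file named part-00000 (the usual case with mapred.reduce.tasks set to 1, as above); the class name ''!CatOutput'' is made up for this example.
{{{
#!java
import java.io.BufferedReader;
import java.io.InputStreamReader;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class CatOutput {
  public static void main(String[] args) throws Exception {
    FileSystem fs = FileSystem.get(new Configuration());
    Path result = new Path("output/part-00000");   // first (and only) reduce output file
    BufferedReader reader =
        new BufferedReader(new InputStreamReader(fs.open(result)));
    String line;
    while ((line = reader.readLine()) != null) {
      System.out.println(line);                    // each line: word <TAB> count
    }
    reader.close();
  }
}
}}}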

'''7. During or after the run, the following pages show the job's progress and results'''
|| URL || Description ||
|| http://localhost:50030 || Map/Reduce Administration (!JobTracker) ||
|| http://localhost:50060 || Task Tracker Status ||
|| http://localhost:50070 || !NameNode / HDFS file browser ||


= 5. References =

 * NCHC Cloud Technique Develop Group
   * http://trac.nchc.org.tw/cloud/
 * IBM Map-Reduce
   * http://www.alphaworks.ibm.com/tech/mapreducetools
 * Cloud9
   * http://www.umiacs.umd.edu/~jimmylin/cloud9/umd-hadoop-dist/cloud9-docs/howto/start.html
 * Running Hadoop
   * http://www.michael-noll.com/wiki/Running_Hadoop_On_Ubuntu_Linux_%28Single-Node_Cluster%29