= Deploy Hadoop to a PC Classroom using DRBL =
* Java is required for Hadoop, so install a Java runtime or JDK first (here the Sun JDK 5 package from the Debian etch non-free repository):
{{{
~$ echo "deb http://free.nchc.org.tw/debian/ etch non-free" > /tmp/etch-non-free.list
~$ sudo mv /tmp/etch-non-free.list /etc/apt/sources.list.d/.
~$ sudo apt-get update
~$ sudo apt-get install sun-java5-jdk
}}}
* download and unpack Hadoop 0.18.2
{{{
~$ wget http://ftp.twaren.net/Unix/Web/apache/hadoop/core/hadoop-0.18.2/hadoop-0.18.2.tar.gz
~$ tar zxvf hadoop-0.18.2.tar.gz
}}}
* set the JAVA_HOME environment variable (a quick sanity check follows):
{{{
~$ echo "export JAVA_HOME=/usr/lib/jvm/java-1.5.0-sun" >> ~/.bash_profile
~$ source ~/.bash_profile
}}}
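* to double-check that the JDK is installed and that JAVA_HOME points at a working runtime (an optional sanity check, not required for the setup):
{{{
~$ echo $JAVA_HOME
~$ $JAVA_HOME/bin/java -version    # should print a 1.5.0 version string
}}}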
* edit hadoop-0.18.2/conf/hadoop-env.sh
{{{
#!diff
--- hadoop-0.18.2/conf/hadoop-env.sh.org 2008-11-06 22:57:40.000000000 +0800
+++ hadoop-0.18.2/conf/hadoop-env.sh 2008-11-06 22:58:42.000000000 +0800
@@ -6,7 +6,9 @@
# remote nodes.
# The java implementation to use. Required.
-# export JAVA_HOME=/usr/lib/j2sdk1.5-sun
+export JAVA_HOME=/usr/lib/jvm/java-1.5.0-sun
+export HADOOP_HOME=/home/jazz/hadoop-0.18.2
+export HADOOP_CONF_DIR=$HADOOP_HOME/conf
# Extra Java CLASSPATH elements. Optional.
# export HADOOP_CLASSPATH=
}}}
* here is the current DRBL setup (a quick interface check follows the diagram):
{{{
******************************************************
NIC NIC IP Clients
+------------------------------+
| DRBL SERVER |
| |
| +-- [eth0] X.X.X.X +- to WAN
| |
| +-- [eth1] 192.168.61.254 +- to clients group 1 [ 16 clients, their IP
| | from 192.168.61.1 - 192.168.61.16]
+------------------------------+
******************************************************
Total clients: 16
******************************************************
}}}
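* to confirm the server really matches this layout, you can check the client-facing NIC (the address below comes from the diagram above):
{{{
~$ /sbin/ifconfig eth1    # should show inet addr:192.168.61.254
}}}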
* Hadoop uses SSH for communication between the nodes, so we have to set up passwordless SSH keys and a dsh machine list (a quick check follows the commands):
{{{
~$ ssh-keygen
~$ cp .ssh/id_rsa.pub .ssh/authorized_keys
~$ sudo apt-get install dsh
~$ mkdir -p .dsh
~$ for ((i=1;i<=16;i++)); do echo "192.168.61.$i" >> .dsh/machines.list; done
}}}
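* before going further, it is worth confirming that passwordless SSH and dsh actually work (this assumes the DRBL clients are already booted):
{{{
~$ ssh 192.168.61.1 hostname    # should print the client hostname without asking for a password
~$ dsh -a uptime                # runs "uptime" on every host in ~/.dsh/machines.list
}}}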
* edit hadoop-0.18.2/conf/hadoop-site.xml
{{{
#!diff
--- hadoop-0.18.2/conf/hadoop-site.xml.org 2008-11-06 23:11:18.000000000 +0800
+++ hadoop-0.18.2/conf/hadoop-site.xml 2008-11-07 17:05:11.000000000 +0800
@@ -4,5 +4,31 @@
-
+  <property>
+    <name>fs.default.name</name>
+    <value>hdfs://192.168.61.254:9000/</value>
+    <description>The name of the default file system. Either the literal string
+    "local" or a host:port for NDFS.
+    </description>
+  </property>
+
+  <property>
+    <name>mapred.job.tracker</name>
+    <value>192.168.61.254:9001</value>
+    <description>The host and port that the MapReduce job tracker runs at. If
+    "local", then jobs are run in-process as a single map and
+    reduce task.
+    </description>
+  </property>
}}}
* edit /etc/rc.local so the DRBL server starts the Hadoop namenode, jobtracker, and tasktracker at boot (a verification step follows the diff):
{{{
#!diff
--- /etc/rc.local.org 2008-11-07 18:09:10.000000000 +0800
+++ /etc/rc.local 2008-11-07 17:58:14.000000000 +0800
@@ -11,4 +11,9 @@
#
# By default this script does nothing.
+echo 3 > /proc/sys/vm/drop_caches
+/home/jazz/hadoop-0.18.2/bin/hadoop namenode -format
+/home/jazz/hadoop-0.18.2/bin/hadoop-daemon.sh start namenode
+/home/jazz/hadoop-0.18.2/bin/hadoop-daemon.sh start jobtracker
+/home/jazz/hadoop-0.18.2/bin/hadoop-daemon.sh start tasktracker
exit 0
}}}
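* after the server reboots, a rough way to verify that the daemons came up is to look at the logs under the Hadoop directory (file names vary with the user and hostname) and at the running Java processes:
{{{
~$ ls /home/jazz/hadoop-0.18.2/logs/
~$ tail /home/jazz/hadoop-0.18.2/logs/*-namenode-*.log
~$ ps aux | grep [j]ava    # the namenode, jobtracker and tasktracker JVMs should be listed
}}}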
* create a hadoop_datanode init script so each DRBL client starts a datanode at boot
{{{
~$ cat > hadoop_datanode << EOF
#! /bin/sh
set -e
# /etc/init.d/hadoop_datanode: start and stop Hadoop DFS datanode for DRBL Client
export PATH="\${PATH:+\$PATH:}/usr/sbin:/sbin"
case "\$1" in
  start)
        echo -n "starting datanode:"
        /home/jazz/hadoop-0.18.2/bin/hadoop-daemon.sh start datanode
        echo "[OK]"
        ;;
  stop)
        echo -n "stopping datanode:"
        /home/jazz/hadoop-0.18.2/bin/hadoop-daemon.sh stop datanode
        echo "[OK]"
        ;;
  *)
        echo "Usage: /etc/init.d/hadoop_datanode {start|stop}"
        exit 1
        ;;
esac
exit 0
EOF
}}}
{{{
~$ chmod a+x hadoop_datanode
~$ sudo /opt/drbl/sbin/drbl-cp-host hadoop_datanode /etc/init.d/
~$ sudo /opt/drbl/bin/drbl-doit update-rc.d hadoop_datanode defaults 99
}}}
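* a quick way to confirm the init script reached every client (assuming the clients are still up) is to list it through dsh:
{{{
~$ dsh -a ls -l /etc/init.d/hadoop_datanode
}}}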
* shut down the DRBL clients
* reboot the DRBL server
* use "Wake on LAN" to boot the DRBL clients
* browse http://192.168.61.254:50070 for the DFS status (a command-line check follows)
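* besides the web interface, the command line can confirm that all 16 datanodes registered and that MapReduce jobs actually run; a small smoke test using the pi example bundled with the release:
{{{
~$ cd hadoop-0.18.2
~$ bin/hadoop dfsadmin -report                             # should list 16 datanodes
~$ bin/hadoop jar hadoop-0.18.2-examples.jar pi 10 1000    # small test job: 10 maps, 1000 samples each
}}}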
== Known Issues ==
* If you want the DRBL clients to mount local hard disks for data storage, hadoop-site.xml must also include the following property (the datanodes then need a restart, as sketched below):
{{{
<property>
  <name>dfs.data.dir</name>
  <value>the path you plan to use (e.g. /home/jazz/hadoop)</value>
  <final>true</final>
</property>
}}}
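* after adding dfs.data.dir, the datanodes have to be restarted to pick up the new setting; one way is to reuse the init script pushed to the clients earlier:
{{{
~$ sudo /opt/drbl/bin/drbl-doit /etc/init.d/hadoop_datanode stop
~$ sudo /opt/drbl/bin/drbl-doit /etc/init.d/hadoop_datanode start
}}}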
* Remember to format the namenode, otherwise the datanodes will not be able to connect to it.
* When local disks are mounted, the second time a datanode starts it fails with the error below (a possible workaround is sketched after the log):
{{{
2009-03-21 01:03:49,814 INFO org.apache.hadoop.dfs.Storage: Cannot lock storage /home/jazz/hadoop. The directory is already locked.
2009-03-21 01:03:49,915 ERROR org.apache.hadoop.dfs.DataNode: java.io.IOException: Cannot lock storage /home/jazz/hadoop. The directory is already locked.
at org.apache.hadoop.dfs.Storage$StorageDirectory.lock(Storage.java:447)
at org.apache.hadoop.dfs.Storage$StorageDirectory.analyzeStorage(Storage.java:300)
at org.apache.hadoop.dfs.DataStorage.recoverTransitionRead(DataStorage.java:105)
at org.apache.hadoop.dfs.DataNode.startDataNode(DataNode.java:306)
at org.apache.hadoop.dfs.DataNode.<init>(DataNode.java:223)
at org.apache.hadoop.dfs.DataNode.makeInstance(DataNode.java:3071)
at org.apache.hadoop.dfs.DataNode.instantiateDataNode(DataNode.java:3026)
at org.apache.hadoop.dfs.DataNode.createDataNode(DataNode.java:3034)
at org.apache.hadoop.dfs.DataNode.main(DataNode.java:3156)
2009-03-21 01:03:49,915 INFO org.apache.hadoop.dfs.DataNode: SHUTDOWN_MSG:
/************************************************************
SHUTDOWN_MSG: Shutting down DataNode at pc144/192.168.100.44
************************************************************/
}}}
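* A workaround that has been used in similar setups is to clear a stale lock before starting the datanode; the sketch below modifies the start) case of /etc/init.d/hadoop_datanode, and it assumes dfs.data.dir is /home/jazz/hadoop and that the leftover lock file is named in_use.lock (the name used by Hadoop 0.18), so adapt it to your layout:
{{{
  start)
        # remove a leftover lock only when no datanode process is running
        if ! pgrep -f org.apache.hadoop.dfs.DataNode > /dev/null; then
                rm -f /home/jazz/hadoop/in_use.lock
        fi
        echo -n "starting datanode:"
        /home/jazz/hadoop-0.18.2/bin/hadoop-daemon.sh start datanode
        echo "[OK]"
        ;;
}}}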