#!/bin/bash
source install_lang
####### garbage here #############
# Print the current value of the global variable Good, followed by a newline.
mainFunction() {
  echo "$Good"
}
# Print the current value of the global variable Bra_Bra_Bra, followed by a
# newline.
braBraBra() {
  echo "$Bra_Bra_Bra"
}
####### garbage end ###############
####### fafa code here ###########
# Assumed parameters:
#   /home/nutchuser/NutchEZ_source contains three files:
#   install.sh, nutch-1.0.tar.gz, apache-tomcat-6.0.18.tar.gz
#   The installation path is /opt/NutchEZ
NutchEZ_HOME=/opt/NutchEZ
Install_source=/home/nutchuser/NutchEZ_source
# From the "inet addr" line of `ifconfig eth0`, keep what follows "addr:",
# drop everything from "Bcast" on, then cut at the first space.
MasterIP_Address=$(/sbin/ifconfig eth0 \
  | grep 'inet addr' \
  | sed -e 's/^.*addr://g' -e 's/Bcast.*$//g' -e 's/ .*//')
# Ask for the administrator e-mail address and the master DNS name.
# Input:           two lines read from stdin (one per prompt)
# Globals written: Admin_email, MasterDNS
set_install_information () {
  # -r keeps backslashes in the answers literal; without it `read` treats
  # them as escape characters and silently drops them.
  read -r -p "Please enter administrator's e-mail address: " Admin_email
  read -r -p "Please enter the Master DNS: " MasterDNS
}
# Print the collected installation settings, one per line.
# Globals read: Admin_email, MasterDNS
show_info () {
  printf "Administrator's e-mail address is %s.\n" "$Admin_email"
  printf 'The master DNS is: %s\n' "$MasterDNS"
}
# Ask the user to confirm the installation settings.
# Input:           one line read from stdin
# Globals written: confirm (the raw answer; the prompt offers 1 = Yes, 2 = No)
confirm_install_information () {
  # -r keeps backslashes in the answer literal instead of consuming them.
  read -r -p "Please confirm your install information: 1.Yes 2.No " confirm
}
# Run the four configuration steps in order: set_hadoop-env, set_haoop-site,
# set_nutch-site, set_crawl-urlfilter.
# Returns: the exit status of the last step.
set_Nutch_conf () {
  local step
  for step in set_hadoop-env set_haoop-site set_nutch-site set_crawl-urlfilter; do
    "$step"
  done
}
# Append the NutchEZ environment settings to $NutchEZ_HOME/conf/hadoop-env.sh.
# Globals read: NutchEZ_HOME
# Side effects: changes the working directory to $NutchEZ_HOME/conf and
#               appends to (does not overwrite) hadoop-env.sh there.
# Returns:      non-zero, writing nothing, if that directory cannot be entered.
set_hadoop-env () {
  echo "set $NutchEZ_HOME/conf/hadoop-env.sh"
  # Stop here if the directory is missing; otherwise the text below would be
  # appended to a hadoop-env.sh in whatever directory happens to be current.
  cd "$NutchEZ_HOME/conf/" || return 1
  # Unquoted delimiter: $NutchEZ_HOME is expanded while the file is written.
  cat >> hadoop-env.sh << EOF
export JAVA_HOME=/usr/lib/jvm/java-6-sun
export HADOOP_HOME=$NutchEZ_HOME
export HADOOP_LOG_DIR=/tmp/NutchEZ/logs
export HADOOP_SLAVES=$NutchEZ_HOME/conf/slaves
export HADOOP_CONF_DIR=$NutchEZ_HOME/conf
export HADOOP_PID_DIR=/tmp/hadoop/pid
export NUTCH_HOME=$NutchEZ_HOME
export NUTCH_CONF_DIR=$NutchEZ_HOME/conf
EOF
}
# Write $NutchEZ_HOME/conf/hadoop-site.xml, replacing any existing file.
# Globals read: NutchEZ_HOME, MasterDNS
# Side effects: changes the working directory to $NutchEZ_HOME/conf.
# Returns:      non-zero, writing nothing, if that directory cannot be entered.
set_haoop-site () {
  echo "set $NutchEZ_HOME/conf/hadoop-site.xml"
  # Stop here if the directory is missing rather than writing the file into
  # whatever directory happens to be current.
  cd "$NutchEZ_HOME/conf/" || return 1
  # The settings are wrapped in <configuration>/<property> elements so the
  # result is a well-formed XML configuration file; the previous output was
  # bare text (names, values and descriptions only).
  # Unquoted delimiter: $MasterDNS is expanded while the file is written.
  cat > hadoop-site.xml << EOF
<?xml version="1.0"?>
<configuration>
<property>
  <name>fs.default.name</name>
  <value>$MasterDNS:9000</value>
  <description>The name of the default file system. Either the literal string "local" or a host:port for NDFS.</description>
</property>
<property>
  <name>mapred.job.tracker</name>
  <value>$MasterDNS:9001</value>
  <description>The host and port that the MapReduce job tracker runs at. If "local", then jobs are run in-process as a single map and reduce task.</description>
</property>
</configuration>
EOF
}
# Write $NutchEZ_HOME/conf/nutch-site.xml, replacing any existing file.
# Globals read: NutchEZ_HOME, MasterDNS, Admin_email
# Side effects: changes the working directory to $NutchEZ_HOME/conf.
# Returns:      non-zero, writing nothing, if that directory cannot be entered.
set_nutch-site () {
  echo "set $NutchEZ_HOME/conf/nutch-site.xml"
  # Stop here if the directory is missing rather than writing the file into
  # whatever directory happens to be current.
  cd "$NutchEZ_HOME/conf/" || return 1
  # The settings are wrapped in <configuration>/<property> elements so the
  # result is a well-formed XML configuration file; the previous output was
  # bare text. The last property used to be named after \$MasterDNS although
  # its value and description are the administrator e-mail; it is now named
  # http.agent.email, the name set_nutch-site2 uses for the same setting.
  # Unquoted delimiter: $MasterDNS and $Admin_email are expanded on writing.
  cat > nutch-site.xml << EOF
<?xml version="1.0"?>
<configuration>
<property>
  <name>http.agent.name</name>
  <value>nutchuser</value>
  <description>HTTP 'User-Agent' request header.</description>
</property>
<property>
  <name>http.agent.description</name>
  <value>MyTest</value>
  <description>Further description</description>
</property>
<property>
  <name>http.agent.url</name>
  <value>$MasterDNS</value>
  <description>A URL to advertise in the User-Agent header.</description>
</property>
<property>
  <name>http.agent.email</name>
  <value>$Admin_email</value>
  <description>An email address</description>
</property>
</configuration>
EOF
}
# Helper: replace the line that follows the first line containing MARKER.
# Arguments: $1  - file to edit in place
#            $2  - marker text, matched as a fixed string
#            $3… - replacement line(s) in their final order, written as sed
#                  "a" text (so \\ produces a single backslash)
# Returns:   1, leaving the file untouched, when the marker is not found.
_crawl_urlfilter_replace_after () {
  local file=$1 marker=$2
  shift 2
  local line_no i
  line_no=$(grep -n -F -m 1 -- "$marker" "$file" | cut -d: -f1)
  # Without a line number the sed commands below would delete line 1 and
  # append the text after every line of the file, so bail out instead.
  if [[ -z "$line_no" ]]; then
    echo "marker not found in $file: $marker" >&2
    return 1
  fi
  sed -i "$((line_no + 1))d" "$file" || return 1
  # Each append goes directly after the marker line, so insert the lines in
  # reverse to end up with them in the order given.
  for (( i = $#; i >= 1; i-- )); do
    sed -i "${line_no}a ${!i}" "$file" || return 1
  done
}

# Rewrite the rules in $NutchEZ_HOME/conf/crawl-urlfilter.txt: for each of the
# four marker comments, the line following the marker is replaced.
# Globals read: NutchEZ_HOME
# Returns:      non-zero if any marker was missing or an edit failed; the
#               remaining markers are still processed.
set_crawl-urlfilter () {
  local filter_file="$NutchEZ_HOME/conf/crawl-urlfilter.txt"
  local status=0
  echo "set $NutchEZ_HOME/conf/crawl-urlfilter.txt"
  _crawl_urlfilter_replace_after "$filter_file" \
    'skip file:, ftp:, & mailto: urls' \
    '-^(ftp|mailto):' || status=1
  _crawl_urlfilter_replace_after "$filter_file" \
    'skip image and other suffixes we can' \
    '-\\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|sit|eps|wmf|mpg|xls|gz|rpm|tgz|mov|MOV|exe|jpeg|JPEG|bmp|BMP)$' || status=1
  _crawl_urlfilter_replace_after "$filter_file" \
    'skip URLs containing certain characters as probable queries, etc.' \
    '-[*!@]' || status=1
  _crawl_urlfilter_replace_after "$filter_file" \
    'skip everything else' \
    '# accecpt anything else' \
    '+.*' || status=1
  return "$status"
}
# Run "$NutchEZ_HOME/bin/hadoop namenode -format".
# Globals read: NutchEZ_HOME
# Returns:      the exit status of the hadoop command.
format_HDFS () {
  echo "format HDFS..."
  # Quoted so an install path containing spaces is not split into words.
  "$NutchEZ_HOME/bin/hadoop" namenode -format
}
# Run "$NutchEZ_HOME/bin/start-all.sh".
# Globals read: NutchEZ_HOME
# Returns:      the exit status of start-all.sh.
start_up_NutchEZ () {
  echo "start up NutchEZ..."
  # Quoted so an install path containing spaces is not split into words.
  "$NutchEZ_HOME/bin/start-all.sh"
}
# Edit $NutchEZ_HOME/tomcat/conf/server.xml in place: look up a line number
# with grep, delete the six lines that follow it, then append text after it.
# Globals read:    NutchEZ_HOME
# Globals written: Line_NO
# NOTE(review): the grep pattern below is the empty string, which matches
# every line, so Line_NO receives every line number of the file instead of a
# single one, and the text handed to the sed "a" command is empty. It looks
# like the search pattern and the replacement text were lost — confirm the
# intended server.xml change before relying on this function.
set_server () {
echo "$NutchEZ_HOME/tomcat/conf/server.xml"
# Number every line matching the pattern and keep only the numbers.
Line_NO=`cat $NutchEZ_HOME'/tomcat/conf/server.xml' | grep -n '' | sed 's/:.*//g'`
# Delete lines Line_NO+1 through Line_NO+6.
sed -i ''$((Line_NO+1))','$((Line_NO+6))'d' $NutchEZ_HOME/tomcat/conf/server.xml
# Append the (currently empty) text after line Line_NO.
sed -i ''$Line_NO'a \
' $NutchEZ_HOME/tomcat/conf/server.xml
}
# Insert a block of Nutch settings into
# $NutchEZ_HOME/tomcat/webapps/ROOT/WEB-INF/classes/nutch-site.xml, directly
# after the line that contains "<configuration>".
# Globals read:    NutchEZ_HOME, Admin_email (both are spliced unquoted into
#                  the sed script, between the single-quoted segments)
# Globals written: line_NO
# NOTE(review): the inserted text holds only setting names, values and
# descriptions, with no XML element markup around them — confirm whether the
# markup was lost, since the target file is an XML file.
# NOTE(review): the only call visible in this file (in Install_Tomcat) is
# commented out.
set_nutch-site2 () {
echo "$NutchEZ_HOME/tomcat/webapps/ROOT/WEB-INF/classes/nutch-site.xml"
# Find the line number after which the settings are inserted
line_NO=`cat $NutchEZ_HOME'/tomcat/webapps/ROOT/WEB-INF/classes/nutch-site.xml' | grep -n '<'configuration'>' | sed 's/:.*//g'`
# Insert the settings
sed -i ''$line_NO'a \
http.agent.name\
nutch\
HTTP 'User-Agent' request header. \
\
\
http.agent.description\
MyTest\
Further description \
\
\
http.agent.url \
localhost \
A URL to advertise in the User-Agent header. \
\
\
http.agent.email\
'$Admin_email' \
An email address \
\
\
\
plugin.folders\
'$NutchEZ_HOME'/plugins\
Directories where nutch plugins are located. \
\
\
plugin.includes\
protocol-(http|httpclient)|urlfilter-regex|parse-(text|html|js|ext|msexcel|mspowerpoint|msword|oo|pdf|rss|swf|zip)|index-(more|basic|anchor)|query-(more|basic|site|url)|response-(json|xml)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)\
Regular expression naming plugin directory names\
\
\
parse.plugin.file\
parse-plugins.xml\
The name of the file that defines the associations between\
content-types and parsers.\
\
\
db.max.outlinks.per.page\
-1\
\
\
\
http.content.limit \
-1\
\
\
indexer.mergeFactor\
500\
The factor that determines the frequency of Lucene segment\
merges. This must not be less than 2, higher values increase indexing\
speed but lead to increased RAM usage, and increase the number of\
open file handles (which may lead to "Too many open files" errors).\
NOTE: the "segments" here have nothing to do with Nutch segments, they\
are a low-level data unit used by Lucene.\
\
\
\
indexer.minMergeDocs\
500\
This number determines the minimum number of Lucene\
Documents buffered in memory between Lucene segment merges. Larger\
values increase indexing speed and increase RAM usage.\
\
\
' $NutchEZ_HOME/tomcat/webapps/ROOT/WEB-INF/classes/nutch-site.xml
}
# Unpack /opt/nutch-1.0.tar.gz, move the result to $NutchEZ_HOME, hand the
# tree to nutchuser and write the Nutch configuration.
# Globals read: NutchEZ_HOME
# Side effects: changes the working directory to /opt.
# Returns:      1 as soon as unpacking or moving fails, so the configuration
#               step never runs against a missing or partial installation;
#               otherwise the exit status of set_Nutch_conf.
Install_Nutch () {
  cd /opt || return 1
  # tar zxvf /opt/nutch-1.0.tar.gz
  tar zxf /opt/nutch-1.0.tar.gz || {
    echo "failed to unpack /opt/nutch-1.0.tar.gz" >&2
    return 1
  }
  # Move to the configured install path instead of a name relative to /opt,
  # so the target always matches what the following steps operate on.
  mv /opt/nutch-1.0 "$NutchEZ_HOME" || {
    echo "failed to move /opt/nutch-1.0 to $NutchEZ_HOME" >&2
    return 1
  }
  # Ownership is best effort: warn but carry on if it cannot be changed.
  chown -R nutchuser:nutchuser "$NutchEZ_HOME" \
    || echo "warning: could not change ownership of $NutchEZ_HOME" >&2
  set_Nutch_conf
}
# Install Tomcat under $NutchEZ_HOME/tomcat and deploy the Nutch web
# application as its ROOT webapp.
# Expects apache-tomcat-6.0.18.tar.gz in /opt and nutch-1.0.war in
# $NutchEZ_HOME.
# Globals read: NutchEZ_HOME
# Side effects: changes the working directory to $NutchEZ_HOME.
# Returns:      1 as soon as a required step fails; otherwise the exit status
#               of set_server.
Install_Tomcat () {
  cd /opt/ || return 1
  # tar zxvf apache-tomcat-6.0.18.tar.gz
  tar zxf apache-tomcat-6.0.18.tar.gz || {
    echo "failed to unpack /opt/apache-tomcat-6.0.18.tar.gz" >&2
    return 1
  }
  mv apache-tomcat-6.0.18 "$NutchEZ_HOME" || return 1
  cd "$NutchEZ_HOME" || return 1
  mv apache-tomcat-6.0.18 tomcat || return 1
  mkdir web || return 1
  # mkdir $NutchEZ_HOME/search
  # jar's extract mode treats extra arguments as names of archive entries to
  # extract, not as a destination directory, so "jar -xvf nutch-1.0.war web"
  # left web/ empty. Extract from inside web/ instead (in a subshell, so the
  # working directory of this function is unaffected).
  ( cd web && jar -xvf ../nutch-1.0.war ) || {
    echo "failed to extract nutch-1.0.war into $NutchEZ_HOME/web" >&2
    return 1
  }
  mv "$NutchEZ_HOME/tomcat/webapps/ROOT" "$NutchEZ_HOME/tomcat/webapps/ROOT-ori" || return 1
  mv "$NutchEZ_HOME/web" "$NutchEZ_HOME/tomcat/webapps/ROOT" || return 1
  # Change ownership only after all files are in place so the extracted
  # webapp is covered too; best effort, warn but carry on if it fails.
  chown -R nutchuser:nutchuser "$NutchEZ_HOME" \
    || echo "warning: could not change ownership of $NutchEZ_HOME" >&2
  set_server
  #set_nutch-site2
}
# Run "$NutchEZ_HOME/tomcat/bin/startup.sh".
# Globals read: NutchEZ_HOME
# Returns:      the exit status of startup.sh.
start_up_tomcat () {
  echo "start up tomcat..."
  # Quoted so an install path containing spaces is not split into words.
  "$NutchEZ_HOME/tomcat/bin/startup.sh"
}