#!/bin/bash
#
# NutchEZ installer function library.
#
# Defines the helper functions used to unpack Nutch 1.0 and Tomcat 6.0.18
# under $NutchEZ_HOME and to generate their configuration files. Nothing is
# installed when this file is loaded; callers invoke the functions they need.
#
# Requires GNU sed (uses `sed -i` and the one-line `a text` form).

source install_lang

####### garbage here #############

# Prints the value of $Good (not set in this file; presumably provided by
# install_lang -- verify).
mainFunction() {
  echo "$Good"
}

# Prints the value of $Bra_Bra_Bra (not set in this file; presumably provided
# by install_lang -- verify).
braBraBra() {
  echo "$Bra_Bra_Bra"
}

####### garbage end ###############

####### fafa code here ###########

# Assumptions:
#   /home/nutchuser/NutchEZ_source contains 3 files:
#     install.sh, nutch-1.0.tar.gz, apache-tomcat-6.0.18.tar.gz
#   The install path is /opt/NutchEZ
Install_source=/home/nutchuser/NutchEZ_source
NutchEZ_HOME=/opt/NutchEZ
# Text between "addr:" and the following space on the "inet addr" line that
# `ifconfig eth0` prints; empty when that line is not present.
MasterIP_Address=$(/sbin/ifconfig eth0 \
  | grep 'inet addr' \
  | sed -e 's/^.*addr://g' -e 's/Bcast.*$//g' -e 's/ .*//')

# Interactively reads the administrator e-mail and master DNS name.
# Globals written: Admin_email, MasterDNS
set_install_information() {
  # -r keeps backslashes in the typed value literal.
  read -r -p "Please enter administrator's e-mail address: " Admin_email
  read -r -p "Please enter the Master DNS: " MasterDNS
}

# Prints the values collected by set_install_information.
# Globals read: Admin_email, MasterDNS
show_info() {
  echo "Administrator's e-mail address is $Admin_email."
  echo "The master DNS is: $MasterDNS"
}

# Asks the user to confirm the entered information.
# Globals written: confirm (the raw answer; interpretation is left to the caller)
confirm_install_information() {
  read -r -p "Please confirm your install infomation: 1.Yes 2.No " confirm
}

# Internal helper: in FILE, find the first line containing the fixed string
# MARKER, delete the line that follows it, and insert the given TEXT lines
# (in the order given) directly after the marker line.
# Arguments: $1 - file, $2 - marker, $3.. - replacement lines
# Returns:   1 (leaving FILE untouched) when the marker cannot be found.
_nutchez_replace_rule_after() {
  local file=$1 marker=$2
  shift 2
  local line_no i

  line_no=$(grep -n -F -- "$marker" "$file" | head -n 1 | cut -d: -f1)
  # Without this check an empty result would make sed delete line 1 and
  # append the text after every line of the file.
  if [[ ! $line_no =~ ^[0-9]+$ ]]; then
    printf 'marker "%s" not found in %s\n' "$marker" "$file" >&2
    return 1
  fi

  sed -i "$((line_no + 1))d" "$file" || return 1
  # Each append lands directly after the marker, so insert last-to-first.
  for (( i = $#; i >= 1; i-- )); do
    sed -i "${line_no}a ${!i}" "$file" || return 1
  done
}

# set $NutchEZ_HOME/conf/hadoop-env.sh
# Appends the Hadoop/Nutch environment exports to hadoop-env.sh.
# Side effect: changes the working directory to $NutchEZ_HOME/conf.
set_hadoop-env() {
  echo "set $NutchEZ_HOME/conf/hadoop-env.sh"
  cd "$NutchEZ_HOME/conf/" || return 1
  cat >> hadoop-env.sh << EOF
export JAVA_HOME=/usr/lib/jvm/java-6-sun
export HADOOP_HOME=$NutchEZ_HOME
export HADOOP_LOG_DIR=/tmp/NutchEZ/logs
export HADOOP_SLAVES=$NutchEZ_HOME/conf/slaves
export HADOOP_CONF_DIR=$NutchEZ_HOME/conf
export HADOOP_PID_DIR=/tmp/hadoop/pid
export NUTCH_HOME=$NutchEZ_HOME
export NUTCH_CONF_DIR=$NutchEZ_HOME/conf
EOF
}

# set $NutchEZ_HOME/conf/hadoop-site.xml
# Overwrites hadoop-site.xml with the file-system and job-tracker addresses.
# (The function name keeps its historical spelling because callers use it.)
# Globals read: MasterDNS
# Side effect: changes the working directory to $NutchEZ_HOME/conf.
set_haoop-site() {
  echo "set $NutchEZ_HOME/conf/hadoop-site.xml"
  cd "$NutchEZ_HOME/conf/" || return 1
  # NOTE(review): the XML element structure was reconstructed around the
  # property names/values/descriptions -- confirm against the upstream script.
  cat > hadoop-site.xml << EOF
<?xml version="1.0"?>
<configuration>
  <property>
    <name>fs.default.name</name>
    <value>${MasterDNS}:9000</value>
    <description>
      The name of the default file system. Either the literal string
      "local" or a host:port for NDFS.
    </description>
  </property>
  <property>
    <name>mapred.job.tracker</name>
    <value>${MasterDNS}:9001</value>
    <description>
      The host and port that the MapReduce job tracker runs at. If
      "local", then jobs are run in-process as a single map and reduce task.
    </description>
  </property>
</configuration>
EOF
}

# Overwrites $NutchEZ_HOME/conf/nutch-site.xml with the crawler identity.
# Globals read: MasterDNS, Admin_email
# Side effect: changes the working directory to $NutchEZ_HOME/conf.
set_nutch-site() {
  echo "set $NutchEZ_HOME/conf/nutch-site.xml"
  cd "$NutchEZ_HOME/conf/" || return 1
  # NOTE(review): the XML element structure was reconstructed; the e-mail
  # property is named http.agent.email to agree with set_nutch-site2.
  cat > nutch-site.xml << EOF
<?xml version="1.0"?>
<configuration>
  <property>
    <name>http.agent.name</name>
    <value>nutchuser</value>
    <description>HTTP 'User-Agent' request header.</description>
  </property>
  <property>
    <name>http.agent.description</name>
    <value>MyTest</value>
    <description>Further description</description>
  </property>
  <property>
    <name>http.agent.url</name>
    <value>${MasterDNS}</value>
    <description>A URL to advertise in the User-Agent header.</description>
  </property>
  <property>
    <name>http.agent.email</name>
    <value>${Admin_email}</value>
    <description>An email address</description>
  </property>
</configuration>
EOF
}

# Rewrites the rule following each known comment marker in
# $NutchEZ_HOME/conf/crawl-urlfilter.txt.
# Returns: 1 as soon as a marker is missing (earlier rules stay rewritten).
set_crawl-urlfilter() {
  local filter_file="$NutchEZ_HOME/conf/crawl-urlfilter.txt"

  echo "set $NutchEZ_HOME/conf/crawl-urlfilter.txt"

  _nutchez_replace_rule_after "$filter_file" \
    'skip file:, ftp:, & mailto: urls' \
    '-^(ftp|mailto):' || return 1

  # The doubled backslash is consumed by sed's `a` command, leaving "\.".
  _nutchez_replace_rule_after "$filter_file" \
    'skip image and other suffixes we can' \
    '-\\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|sit|eps|wmf|mpg|xls|gz|rpm|tgz|mov|MOV|exe|jpeg|JPEG|bmp|BMP)$' || return 1

  _nutchez_replace_rule_after "$filter_file" \
    'skip URLs containing certain characters as probable queries, etc.' \
    '-[*!@]' || return 1

  _nutchez_replace_rule_after "$filter_file" \
    'skip everything else' \
    '# accecpt anything else' \
    '+.*' || return 1
}

# Runs `hadoop namenode -format` from the installed tree.
format_HDFS() {
  echo "format HDFS..."
  "$NutchEZ_HOME/bin/hadoop" namenode -format
}

# Runs the installed start-all.sh.
start_up_NutchEZ() {
  echo "start up NutchEZ..."
  "$NutchEZ_HOME/bin/start-all.sh"
}

# Edits $NutchEZ_HOME/tomcat/conf/server.xml: removes the six lines after an
# anchor line and appends replacement text after the anchor.
# NOTE(review): in this copy of the script both the grep pattern that selects
# the anchor line and the text to append are empty; they look like markup that
# was lost. Restore them from the upstream script before relying on this
# function. Until then the guard below refuses to edit any server.xml that has
# more than one line instead of failing inside the arithmetic expansion.
set_server() {
  local server_xml="$NutchEZ_HOME/tomcat/conf/server.xml"
  local line_no

  echo "$NutchEZ_HOME/tomcat/conf/server.xml"

  line_no=$(grep -n '' "$server_xml" | sed 's/:.*//g')
  if [[ ! $line_no =~ ^[0-9]+$ ]]; then
    printf 'set_server: no unique anchor line found in %s\n' "$server_xml" >&2
    return 1
  fi

  sed -i "$((line_no + 1)),$((line_no + 6))d" "$server_xml" || return 1
  sed -i "${line_no}"'a \ ' "$server_xml"
}

# Inserts the search-side Nutch properties right after the <configuration>
# line of the web application's nutch-site.xml.
# Globals read: NutchEZ_HOME, Admin_email
# Returns: 1 when the file has no <configuration> line or cannot be edited.
set_nutch-site2() {
  local site_xml="$NutchEZ_HOME/tomcat/webapps/ROOT/WEB-INF/classes/nutch-site.xml"
  local line_no snippet rc

  echo "$NutchEZ_HOME/tomcat/webapps/ROOT/WEB-INF/classes/nutch-site.xml"

  # Find the line number after which the settings are inserted.
  line_no=$(grep -n -F '<configuration>' "$site_xml" | head -n 1 | cut -d: -f1)
  if [[ ! $line_no =~ ^[0-9]+$ ]]; then
    printf 'set_nutch-site2: <configuration> not found in %s\n' "$site_xml" >&2
    return 1
  fi

  # Insert the settings. They are staged in a temporary file so that sed's
  # `r` command can splice them in without any escaping of the XML text.
  # NOTE(review): the XML element structure was reconstructed -- confirm.
  snippet=$(mktemp) || return 1
  cat > "$snippet" << EOF
  <property>
    <name>http.agent.name</name>
    <value>nutch</value>
    <description>HTTP 'User-Agent' request header.</description>
  </property>
  <property>
    <name>http.agent.description</name>
    <value>MyTest</value>
    <description>Further description</description>
  </property>
  <property>
    <name>http.agent.url</name>
    <value>localhost</value>
    <description>A URL to advertise in the User-Agent header.</description>
  </property>
  <property>
    <name>http.agent.email</name>
    <value>${Admin_email}</value>
    <description>An email address</description>
  </property>
  <property>
    <name>plugin.folders</name>
    <value>${NutchEZ_HOME}/plugins</value>
    <description>Directories where nutch plugins are located.</description>
  </property>
  <property>
    <name>plugin.includes</name>
    <value>protocol-(http|httpclient)|urlfilter-regex|parse-(text|html|js|ext|msexcel|mspowerpoint|msword|oo|pdf|rss|swf|zip)|index-(more|basic|anchor)|query-(more|basic|site|url)|response-(json|xml)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
    <description>Regular expression naming plugin directory names</description>
  </property>
  <property>
    <name>parse.plugin.file</name>
    <value>parse-plugins.xml</value>
    <description>The name of the file that defines the associations between
    content-types and parsers.</description>
  </property>
  <property>
    <name>db.max.outlinks.per.page</name>
    <value>-1</value>
    <description></description>
  </property>
  <property>
    <name>http.content.limit</name>
    <value>-1</value>
  </property>
  <property>
    <name>indexer.mergeFactor</name>
    <value>500</value>
    <description>The factor that determines the frequency of Lucene segment
    merges. This must not be less than 2, higher values increase indexing
    speed but lead to increased RAM usage, and increase the number of
    open file handles (which may lead to "Too many open files" errors).
    NOTE: the "segments" here have nothing to do with Nutch segments, they
    are a low-level data unit used by Lucene.
    </description>
  </property>
  <property>
    <name>indexer.minMergeDocs</name>
    <value>500</value>
    <description>This number determines the minimum number of Lucene
    Documents buffered in memory between Lucene segment merges. Larger
    values increase indexing speed and increase RAM usage.
    </description>
  </property>
EOF

  sed -i "${line_no}r ${snippet}" "$site_xml"
  rc=$?
  rm -f -- "$snippet"
  return "$rc"
}

# Generates every file under $NutchEZ_HOME/conf. All four steps are attempted;
# the return status is 1 if any of them failed.
set_Nutch_conf() {
  local rc=0
  set_hadoop-env || rc=1
  set_haoop-site || rc=1
  set_nutch-site || rc=1
  set_crawl-urlfilter || rc=1
  return "$rc"
}

# Unpacks /opt/nutch-1.0.tar.gz, moves it to $NutchEZ_HOME and configures it.
# Side effect: changes the working directory.
Install_Nutch() {
  cd /opt || return 1
  tar zxf /opt/nutch-1.0.tar.gz || return 1
  # tar zxvf /opt/nutch-1.0.tar.gz
  mv /opt/nutch-1.0 "$NutchEZ_HOME" || return 1
  chown -R nutchuser:nutchuser "$NutchEZ_HOME"
  set_Nutch_conf
}

# install tomcat
# Unpacks /opt/apache-tomcat-6.0.18.tar.gz into $NutchEZ_HOME/tomcat and
# deploys nutch-1.0.war as Tomcat's ROOT web application (the stock ROOT is
# kept as ROOT-ori).
# Side effect: changes the working directory to $NutchEZ_HOME.
Install_Tomcat() {
  cd /opt/ || return 1
  # tar zxvf apache-tomcat-6.0.18.tar.gz
  tar zxf apache-tomcat-6.0.18.tar.gz || return 1
  mv apache-tomcat-6.0.18 "$NutchEZ_HOME" || return 1
  cd "$NutchEZ_HOME" || return 1
  mv apache-tomcat-6.0.18 tomcat || return 1
  mkdir web || return 1
  # mkdir $NutchEZ_HOME/search
  # `jar x` always unpacks into the current directory, and any trailing
  # argument names an archive member to extract rather than a destination, so
  # the archive has to be unpacked from inside web/.
  ( cd web && jar -xvf ../nutch-1.0.war ) || return 1
  mv "$NutchEZ_HOME/tomcat/webapps/ROOT" "$NutchEZ_HOME/tomcat/webapps/ROOT-ori" || return 1
  mv "$NutchEZ_HOME/web" "$NutchEZ_HOME/tomcat/webapps/ROOT" || return 1
  # Done after extraction so the unpacked web application is owned by
  # nutchuser as well.
  chown -R nutchuser:nutchuser "$NutchEZ_HOME"
  set_server
  #set_nutch-site2
}

# Runs Tomcat's startup.sh from the installed tree.
start_up_tomcat() {
  echo "start up tomcat..."
  "$NutchEZ_HOME/tomcat/bin/startup.sh"
}