[116] | 1 | #!/bin/bash |
---|
| 2 | source install_lang |
---|
[117] | 3 | ####### garbage here ############# |
---|
#######################################
# Print the message stored in the Good variable.
# Globals:   Good (read) - not assigned in this part of the file;
#            presumably provided by the sourced install_lang (TODO confirm)
# Outputs:   the value of Good followed by a newline, on stdout
#######################################
mainFunction() {
  # printf rather than echo: a message such as "-n" or "-e" must be printed
  # verbatim instead of being consumed as an echo option.
  printf '%s\n' "$Good"
}
---|
#######################################
# Print the message stored in the Bra_Bra_Bra variable.
# Globals:   Bra_Bra_Bra (read) - not assigned in this part of the file;
#            presumably provided by the sourced install_lang (TODO confirm)
# Outputs:   the value of Bra_Bra_Bra followed by a newline, on stdout
#######################################
braBraBra() {
  # printf rather than echo: a message such as "-n" or "-e" must be printed
  # verbatim instead of being consumed as an echo option.
  printf '%s\n' "$Bra_Bra_Bra"
}
---|
[117] | 12 | ####### garbage end ############### |
---|
| 13 | |
---|
| 14 | |
---|
| 15 | |
---|
| 16 | ####### fafa code here ########### |
---|
| 17 | |
---|
# Assumed layout:
#   /home/nutchuser/NutchEZ_source contains three files:
#     install.sh, nutch-1.0.tar.gz, apache-tomcat-6.0.18.tar.gz
#   The installation path is /opt/NutchEZ.

Install_source='/home/nutchuser/NutchEZ_source'
NutchEZ_HOME='/opt/NutchEZ'

# Address of eth0 as reported by ifconfig: keep the "inet addr" line, drop
# everything up to and including "addr:", drop the Bcast field, then drop
# whatever follows the first remaining space.
MasterIP_Address=$(
  /sbin/ifconfig eth0 \
    | grep 'inet addr' \
    | sed 's/^.*addr://g' \
    | sed 's/Bcast.*$//g' \
    | sed 's/ .*// '
)
---|
[125] | 26 | |
---|
| 27 | |
---|
#######################################
# Interactively ask for the installation parameters.
# Globals:   Admin_email (written), MasterDNS (written)
# Inputs:    two lines read from stdin
# Outputs:   the prompts (bash prints them only when stdin is a terminal)
#######################################
set_install_information() {
  # -r keeps backslashes in the typed value instead of treating them as
  # escape characters.
  read -r -p "Please enter administrator's e-mail address: " Admin_email
  read -r -p "Please enter the Master DNS: " MasterDNS
}
---|
| 32 | |
---|
#######################################
# Display the installation parameters currently held in the globals.
# Globals:   Admin_email (read), MasterDNS (read)
# Outputs:   two lines on stdout
#######################################
show_info() {
  printf '%s\n' \
    "Administrator's e-mail address is $Admin_email." \
    "The master DNS is: $MasterDNS"
}
---|
| 37 | |
---|
#######################################
# Ask the user to confirm the installation parameters.
# Globals:   confirm (written) - the raw answer; the prompt offers
#            "1" for yes and "2" for no
# Inputs:    one line read from stdin
#######################################
confirm_install_information() {
  # -r: store the answer exactly as typed (no backslash processing).
  read -r -p "Please confirm your install infomation: 1.Yes 2.No " confirm
}
---|
| 41 | |
---|
#######################################
# Run every Nutch configuration step, in order:
# hadoop-env.sh, hadoop-site.xml, nutch-site.xml, crawl-urlfilter.txt.
# Returns:   the status of the last step
#######################################
set_Nutch_conf() {
  local step
  # "set_haoop-site" is spelled the way the function is defined in this file.
  for step in set_hadoop-env set_haoop-site set_nutch-site set_crawl-urlfilter; do
    "$step"
  done
}
---|
| 48 | |
---|
#######################################
# Append the NutchEZ environment settings to $NutchEZ_HOME/conf/hadoop-env.sh.
# Globals:   NutchEZ_HOME (read)
# Outputs:   a progress line on stdout
# Side effects: changes the working directory to $NutchEZ_HOME/conf and
#            appends (does not overwrite) to hadoop-env.sh there
# Returns:   non-zero if the conf directory cannot be entered or written
#######################################
set_hadoop-env() {
  echo "set $NutchEZ_HOME/conf/hadoop-env.sh"
  # Stop if the directory is missing; otherwise the settings would be
  # appended to a stray hadoop-env.sh in whatever directory is current.
  cd "$NutchEZ_HOME/conf/" || return 1
  cat >> hadoop-env.sh << EOF
export JAVA_HOME=/usr/lib/jvm/java-6-sun
export HADOOP_HOME=$NutchEZ_HOME
export HADOOP_LOG_DIR=/tmp/NutchEZ/logs
export HADOOP_SLAVES=$NutchEZ_HOME/conf/slaves
export HADOOP_CONF_DIR=$NutchEZ_HOME/conf
export HADOOP_PID_DIR=/tmp/hadoop/pid
export NUTCH_HOME=$NutchEZ_HOME
export NUTCH_CONF_DIR=$NutchEZ_HOME/conf
EOF
}
---|
| 64 | |
---|
#######################################
# Write $NutchEZ_HOME/conf/hadoop-site.xml (overwriting any existing file)
# with the file-system and job-tracker addresses of the master.
# The function name is misspelled ("haoop"); it is kept because
# set_Nutch_conf calls it by this name.
# Globals:   NutchEZ_HOME (read), MasterDNS (read)
# Outputs:   a progress line on stdout
# Side effects: changes the working directory to $NutchEZ_HOME/conf
# Returns:   non-zero if the conf directory cannot be entered or written
#######################################
set_haoop-site() {
  echo "set $NutchEZ_HOME/conf/hadoop-site.xml"
  # Stop if the directory is missing instead of writing the file into
  # whatever directory happens to be current.
  cd "$NutchEZ_HOME/conf/" || return 1
  cat > hadoop-site.xml << EOF
<configuration>
<property>
<name>fs.default.name</name>
<value>${MasterDNS}:9000</value>
<description> The name of the default file system. Either the literal string "local" or a host:port for NDFS. </description>
</property>
<property>
<name>mapred.job.tracker</name>
<value>${MasterDNS}:9001</value>
<description> The host and port that the MapReduce job tracker runs at. If "local", then jobs are run in-process as a single map and reduce task. </description>
</property>
</configuration>
EOF
}
---|
| 84 | |
---|
#######################################
# Write $NutchEZ_HOME/conf/nutch-site.xml (overwriting any existing file)
# with the HTTP agent identification properties.
# Globals:   NutchEZ_HOME (read), MasterDNS (read), Admin_email (read)
# Outputs:   a progress line on stdout
# Side effects: changes the working directory to $NutchEZ_HOME/conf
# Returns:   non-zero if the conf directory cannot be entered or written
#######################################
set_nutch-site() {
  echo "set $NutchEZ_HOME/conf/nutch-site.xml"
  # Stop if the directory is missing instead of writing the file into
  # whatever directory happens to be current.
  cd "$NutchEZ_HOME/conf/" || return 1
  # The e-mail property is named http.agent.email, matching the name used by
  # set_nutch-site2 in this file; the master DNS name is not a property name.
  cat > nutch-site.xml << EOF
<configuration>
<property>
<name>http.agent.name</name>
<value>nutchuser</value>
<description>HTTP 'User-Agent' request header. </description>
</property>
<property>
<name>http.agent.description</name>
<value>MyTest</value>
<description>Further description</description>
</property>
<property>
<name>http.agent.url</name>
<value>$MasterDNS</value>
<description>A URL to advertise in the User-Agent header. </description>
</property>
<property>
<name>http.agent.email</name>
<value>$Admin_email</value>
<description>An email address
</description>
</property>
</configuration>
EOF
}
---|
| 114 | |
---|
[126] | 115 | |
---|
#######################################
# Helper for set_crawl-urlfilter: delete the line that follows a marker line
# and append replacement line(s) directly after the marker.
# Arguments: $1     - file to edit in place
#            $2     - grep basic regular expression locating the marker line
#            $3...  - text for the sed "a" command, one call per argument;
#                     backslashes must be doubled, as sed requires. Each
#                     argument is inserted right after the marker, so later
#                     arguments end up above earlier ones.
# Returns:   non-zero, without touching the file, unless the marker matches
#            exactly one line; non-zero if sed fails
#######################################
_urlfilter_replace_after() {
  local file=$1 marker=$2 line_no text
  shift 2
  line_no=$(grep -n -- "$marker" "$file" | sed 's/:.*//g')
  # An empty or multi-line result would make sed delete line 1 and append
  # after every line (or fail in the arithmetic), so refuse to continue.
  if [[ ! $line_no =~ ^[0-9]+$ ]]; then
    printf 'set_crawl-urlfilter: marker "%s" not found exactly once in %s\n' \
      "$marker" "$file" >&2
    return 1
  fi
  sed -i "$((line_no + 1))d" "$file" || return 1
  for text in "$@"; do
    sed -i "${line_no}a ${text}" "$file" || return 1
  done
}

#######################################
# Relax the rules in $NutchEZ_HOME/conf/crawl-urlfilter.txt: for each of four
# marker comments, replace the rule on the following line.
# Globals:   NutchEZ_HOME (read)
# Outputs:   a progress line on stdout; diagnostics on stderr
# Returns:   non-zero as soon as one marker is missing or an edit fails
#######################################
set_crawl-urlfilter() {
  local filter_file="$NutchEZ_HOME/conf/crawl-urlfilter.txt"

  echo "set $NutchEZ_HOME/conf/crawl-urlfilter.txt"

  _urlfilter_replace_after "$filter_file" \
    'skip file:, ftp:, & mailto: urls' \
    '-^(ftp|mailto):' || return 1

  _urlfilter_replace_after "$filter_file" \
    'skip image and other suffixes we can' \
    '-\\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|sit|eps|wmf|mpg|xls|gz|rpm|tgz|mov|MOV|exe|jpeg|JPEG|bmp|BMP)$' || return 1

  _urlfilter_replace_after "$filter_file" \
    'skip URLs containing certain characters as probable queries, etc.' \
    '-[*!@]' || return 1

  # The comment is passed second so that it lands above the "+.*" rule.
  _urlfilter_replace_after "$filter_file" \
    'skip everything else' \
    '+.*' \
    '# accecpt anything else' || return 1
}
---|
| 138 | |
---|
#######################################
# Run "hadoop namenode -format" using the hadoop launcher under
# $NutchEZ_HOME/bin.
# Globals:   NutchEZ_HOME (read)
# Outputs:   a progress line on stdout, then whatever the command prints
# Returns:   the exit status of the hadoop command
#######################################
format_HDFS() {
  echo "format HDFS..."
  # Quoted so an install path containing spaces still resolves to one word.
  "$NutchEZ_HOME/bin/hadoop" namenode -format
}
---|
| 143 | |
---|
#######################################
# Run $NutchEZ_HOME/bin/start-all.sh.
# Globals:   NutchEZ_HOME (read)
# Outputs:   a progress line on stdout, then whatever the script prints
# Returns:   the exit status of start-all.sh
#######################################
start_up_NutchEZ() {
  echo "start up NutchEZ..."
  # Quoted so an install path containing spaces still resolves to one word.
  "$NutchEZ_HOME/bin/start-all.sh"
}
---|
| 148 | |
---|
#######################################
# Edit $NutchEZ_HOME/tomcat/conf/server.xml: remove the six lines that follow
# the 'A "Connector" using the shared thread pool' comment and insert, right
# after that comment, a Connector on port 8080 with URIEncoding="UTF-8" and
# useBodyEncodingForURI="true".
# Globals:   NutchEZ_HOME (read)
# Outputs:   the path of the file on stdout; diagnostics on stderr
# Returns:   non-zero, without touching the file, unless the marker comment
#            matches exactly one line; non-zero if sed fails
#######################################
set_server() {
  local server_xml="$NutchEZ_HOME/tomcat/conf/server.xml"
  local marker='<!-- A "Connector" using the shared thread pool-->'
  local line_no

  echo "$NutchEZ_HOME/tomcat/conf/server.xml"

  line_no=$(grep -n -- "$marker" "$server_xml" | sed 's/:.*//g')
  # Without a single numeric line number sed would delete lines 1-6 and
  # append the connector after every line of the file.
  if [[ ! $line_no =~ ^[0-9]+$ ]]; then
    printf 'set_server: connector marker not found exactly once in %s\n' \
      "$server_xml" >&2
    return 1
  fi

  sed -i "$((line_no + 1)),$((line_no + 6))d" "$server_xml" || return 1
  # The trailing backslashes continue the text of the sed "a" command.
  sed -i "${line_no}"'a <Connector port="8080" protocol="HTTP/1.1"\
connectionTimeout="20000"\
redirectPort="8443" URIEncoding="UTF-8"\
useBodyEncodingForURI="true" />\
' "$server_xml"
}
---|
| 160 | |
---|
[131] | 161 | |
---|
#######################################
# Insert the NutchEZ properties into the web application's nutch-site.xml
# ($NutchEZ_HOME/tomcat/webapps/ROOT/WEB-INF/classes/nutch-site.xml), right
# after its opening <configuration> line.
#
# The properties are written to a temporary file and spliced in with the sed
# "r" command. This avoids embedding the text in a sed "a" script, where a
# blank line ends the text early and shell quotes are stripped.
# Globals:   NutchEZ_HOME (read), Admin_email (read)
# Outputs:   the path of the file on stdout; diagnostics on stderr
# Returns:   non-zero, without touching the file, unless <configuration>
#            matches exactly one line; otherwise the status of sed
#######################################
set_nutch-site2() {
  local site_xml="$NutchEZ_HOME/tomcat/webapps/ROOT/WEB-INF/classes/nutch-site.xml"
  local line_no snippet status

  echo "$NutchEZ_HOME/tomcat/webapps/ROOT/WEB-INF/classes/nutch-site.xml"

  # Find the line number after which the settings are inserted.
  line_no=$(grep -n -- '<configuration>' "$site_xml" | sed 's/:.*//g')
  if [[ ! $line_no =~ ^[0-9]+$ ]]; then
    printf 'set_nutch-site2: <configuration> not found exactly once in %s\n' \
      "$site_xml" >&2
    return 1
  fi

  snippet=$(mktemp) || return 1

  # Build the settings to insert.
  cat > "$snippet" << EOF
<property>
<name>http.agent.name</name>
<value>nutch</value>
<description>HTTP 'User-Agent' request header. </description>
</property>
<property>
<name>http.agent.description</name>
<value>MyTest</value>
<description>Further description</description>
</property>
<property>
<name>http.agent.url</name>
<value>localhost</value>
<description>A URL to advertise in the User-Agent header. </description>
</property>
<property>
<name>http.agent.email</name>
<value>$Admin_email</value>
<description>An email address
</description>
</property>
<property>
<name>plugin.folders</name>
<value>$NutchEZ_HOME/plugins</value>
<description>Directories where nutch plugins are located. </description>
</property>
<property>
<name>plugin.includes</name>
<value>protocol-(http|httpclient)|urlfilter-regex|parse-(text|html|js|ext|msexcel|mspowerpoint|msword|oo|pdf|rss|swf|zip)|index-(more|basic|anchor)|query-(more|basic|site|url)|response-(json|xml)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
<description> Regular expression naming plugin directory names</description>
</property>
<property>
<name>parse.plugin.file</name>
<value>parse-plugins.xml</value>
<description>The name of the file that defines the associations between
content-types and parsers.</description>
</property>
<property>
<name>db.max.outlinks.per.page</name>
<value>-1</value>
<description> </description>
</property>
<property>
<name>http.content.limit</name>
<value>-1</value>
</property>
<property>
<name>indexer.mergeFactor</name>
<value>500</value>
<description>The factor that determines the frequency of Lucene segment
merges. This must not be less than 2, higher values increase indexing
speed but lead to increased RAM usage, and increase the number of
open file handles (which may lead to "Too many open files" errors).
NOTE: the "segments" here have nothing to do with Nutch segments, they
are a low-level data unit used by Lucene.
</description>
</property>
<property>
<name>indexer.minMergeDocs</name>
<value>500</value>
<description>This number determines the minimum number of Lucene
Documents buffered in memory between Lucene segment merges. Larger
values increase indexing speed and increase RAM usage.
</description>
</property>
EOF

  # Insert the settings after the <configuration> line.
  sed -i "${line_no}r ${snippet}" "$site_xml"
  status=$?
  rm -f -- "$snippet"
  return "$status"
}
---|
| 238 | |
---|
#######################################
# Unpack /opt/nutch-1.0.tar.gz, move the tree to $NutchEZ_HOME, hand it to
# nutchuser and write the Nutch configuration.
# Globals:   NutchEZ_HOME (read)
# Side effects: changes the working directory to /opt
# Returns:   non-zero as soon as one step fails (later steps are skipped so a
#            failed unpack is never followed by mv/chown/configuration);
#            otherwise the status of set_Nutch_conf
#######################################
Install_Nutch() {
  cd /opt || return 1
  # Quiet extraction; use "tar zxvf" for a listing of the unpacked files.
  tar zxf /opt/nutch-1.0.tar.gz || return 1
  # The target is $NutchEZ_HOME (by default /opt/NutchEZ) so it is the same
  # tree that chown and set_Nutch_conf operate on.
  mv /opt/nutch-1.0 "$NutchEZ_HOME" || return 1
  chown -R nutchuser:nutchuser "$NutchEZ_HOME" || return 1
  set_Nutch_conf
}
---|
[125] | 247 | |
---|
#######################################
# Install Tomcat under $NutchEZ_HOME/tomcat and deploy nutch-1.0.war as its
# ROOT web application (the stock ROOT is kept as ROOT-ori), then adjust
# server.xml through set_server.
# Globals:   NutchEZ_HOME (read)
# Side effects: changes the working directory to $NutchEZ_HOME
# Returns:   non-zero as soon as one step fails; otherwise the status of
#            set_server
#######################################
Install_Tomcat() {
  cd /opt/ || return 1
  # Quiet extraction; use "tar zxvf" for a listing of the unpacked files.
  tar zxf apache-tomcat-6.0.18.tar.gz || return 1
  mv apache-tomcat-6.0.18 "$NutchEZ_HOME" || return 1
  cd "$NutchEZ_HOME" || return 1
  mv apache-tomcat-6.0.18 tomcat || return 1
  mkdir web || return 1
  # jar always extracts into the current directory; extra arguments after the
  # archive select which entries to extract, they are not a destination. So
  # the archive is unpacked from inside web/, in a subshell that leaves the
  # caller's working directory alone.
  ( cd web && jar -xvf ../nutch-1.0.war ) || return 1
  mv "$NutchEZ_HOME/tomcat/webapps/ROOT" "$NutchEZ_HOME/tomcat/webapps/ROOT-ori" || return 1
  mv "$NutchEZ_HOME/web" "$NutchEZ_HOME/tomcat/webapps/ROOT" || return 1
  # Ownership is changed after the web application is in place so that the
  # freshly extracted files are covered as well.
  chown -R nutchuser:nutchuser "$NutchEZ_HOME" || return 1
  set_server
  # set_nutch-site2 is deliberately not called here (it was already disabled).
}
---|
| 265 | |
---|
#######################################
# Run $NutchEZ_HOME/tomcat/bin/startup.sh.
# Globals:   NutchEZ_HOME (read)
# Outputs:   a progress line on stdout, then whatever startup.sh prints
# Returns:   the exit status of startup.sh
#######################################
start_up_tomcat() {
  echo "start up tomcat..."
  # Quoted so an install path containing spaces still resolves to one word.
  "$NutchEZ_HOME/tomcat/bin/startup.sh"
}
---|