source: nutchez-0.2/src/test/install_func.sh @ 136

Last change on this file since 136 was 136, checked in by shunfa, 14 years ago

install流程修改

  • Property svn:executable set to *
File size: 8.4 KB
RevLine 
#!/bin/bash
# Helper functions for the NutchEZ installer.
#
# Load the companion file install_lang. Because the name contains no slash,
# bash looks it up in $PATH first and then in the current directory.
# NOTE(review): presumably it defines the message variables used below
# (e.g. $Good, $Bra_Bra_Bra) -- confirm against install_lang.
source install_lang
####### garbage here #############
# Print the global $Good followed by a newline.
# Globals: Good (read)
# Outputs: the value of $Good on stdout
mainFunction() {
  # printf instead of echo so values such as "-n" are printed literally.
  printf '%s\n' "$Good"
}
# Print the global $Bra_Bra_Bra followed by a newline.
# Globals: Bra_Bra_Bra (read)
# Outputs: the value of $Bra_Bra_Bra on stdout
braBraBra() {
  # printf instead of echo so values such as "-n" are printed literally.
  printf '%s\n' "$Bra_Bra_Bra"
}
####### garbage end ###############
13
14
15
####### fafa code here ###########

# Assumed parameters:
# /home/nutchuser/NutchEZ_source contains three files:
# install.sh, nutch-1.0.tar.gz, apache-tomcat-6.0.18.tar.gz
# The installation path is /opt/NutchEZ

Install_source=/home/nutchuser/NutchEZ_source
NutchEZ_HOME=/opt/NutchEZ
# IPv4 address of eth0, taken from the "inet addr:<ip>  Bcast:..." line that
# ifconfig prints.
# NOTE(review): this stays empty when eth0 is absent or when ifconfig uses a
# different output layout (no "inet addr:" prefix) -- confirm on target hosts.
MasterIP_Address=$(/sbin/ifconfig eth0 \
  | grep 'inet addr' \
  | sed -e 's/^.*addr://' -e 's/Bcast.*$//' -e 's/ .*//')
[125]26
27
# Ask the operator for the administrator e-mail address and the master DNS.
# Globals: Admin_email, MasterDNS (written)
# Input:   two lines read from stdin
set_install_information() {
  # -r keeps backslashes in the typed answers instead of treating them as
  # escape characters.
  read -r -p "Please enter administrator's e-mail address:  " Admin_email
  read -r -p "Please enter the Master DNS:  " MasterDNS
}
32
# Print the values collected by the installer prompts.
# Globals: Admin_email, MasterDNS (read)
# Outputs: two lines on stdout
show_info() {
  printf '%s\n' "Administrator's e-mail address is $Admin_email."
  printf '%s\n' "The master DNS is: $MasterDNS"
}
37
# Ask the operator to confirm the entered information.
# Globals: confirm (written) -- the raw answer; it is not validated here.
# Input:   one line read from stdin
confirm_install_information() {
  # -r keeps the answer verbatim (no backslash processing).
  read -r -p "Please confirm your install infomation: 1.Yes 2.No  " confirm
}
41
# Append the NutchEZ environment settings to $NutchEZ_HOME/conf/hadoop-env.sh.
# Globals: NutchEZ_HOME (read)
# Side effects: changes the working directory to $NutchEZ_HOME/conf and
#               appends to hadoop-env.sh there (repeated calls append again).
# Returns: non-zero if the conf directory cannot be entered or the write fails.
set_hadoop-env() {
  echo "set $NutchEZ_HOME/conf/hadoop-env.sh"
  # Bail out instead of appending to a hadoop-env.sh in whatever directory
  # we happen to be in when the conf directory is missing.
  cd "$NutchEZ_HOME/conf/" || return
  cat >> hadoop-env.sh << EOF
export JAVA_HOME=/usr/lib/jvm/java-6-sun
export HADOOP_HOME=$NutchEZ_HOME
export HADOOP_LOG_DIR=/tmp/NutchEZ/logs
export HADOOP_SLAVES=$NutchEZ_HOME/conf/slaves
export HADOOP_CONF_DIR=$NutchEZ_HOME/conf
export HADOOP_PID_DIR=/tmp/hadoop/pid
export NUTCH_HOME=$NutchEZ_HOME
export NUTCH_CONF_DIR=$NutchEZ_HOME/conf
EOF
}
57
# Write $NutchEZ_HOME/conf/hadoop-site.xml, pointing the default file system
# (port 9000) and the job tracker (port 9001) at $MasterDNS.
# Globals: NutchEZ_HOME, MasterDNS (read)
# Side effects: changes the working directory to $NutchEZ_HOME/conf and
#               overwrites hadoop-site.xml there.
# Returns: non-zero if the conf directory cannot be entered or the write fails.
# (The function name keeps its historical spelling because callers use it.)
set_haoop-site() {
  echo "set $NutchEZ_HOME/conf/hadoop-site.xml"
  # Do not write the file into an unrelated directory when conf is missing.
  cd "$NutchEZ_HOME/conf/" || return
  cat > hadoop-site.xml << EOF
<configuration>
<property>
    <name>fs.default.name</name>
    <value>$MasterDNS:9000</value>
    <description> The name of the default file system. Either the literal string "local" or a host:port for NDFS. </description>
</property>
<property>
    <name>mapred.job.tracker</name>
    <value>$MasterDNS:9001</value>
    <description> The host and port that the MapReduce job tracker runs at. If "local", then jobs are run in-process as a single map and reduce task. </description>
</property>
</configuration>
EOF
}
77
# Write $NutchEZ_HOME/conf/nutch-site.xml with the crawler's HTTP agent
# identity (name, description, URL and contact e-mail).
# Globals: NutchEZ_HOME, MasterDNS, Admin_email (read)
# Side effects: changes the working directory to $NutchEZ_HOME/conf and
#               overwrites nutch-site.xml there.
# Returns: non-zero if the conf directory cannot be entered or the write fails.
set_nutch-site() {
  echo "set $NutchEZ_HOME/conf/nutch-site.xml"
  # Do not write the file into an unrelated directory when conf is missing.
  cd "$NutchEZ_HOME/conf/" || return
  # The e-mail property must be named http.agent.email; it used to be
  # emitted under the master DNS name, so the address was never picked up.
  cat > nutch-site.xml << EOF
<configuration>
<property>
  <name>http.agent.name</name>
  <value>nutchuser</value>
  <description>HTTP 'User-Agent' request header. </description>
</property>
<property>
  <name>http.agent.description</name>
  <value>MyTest</value>
  <description>Further description</description>
</property>
<property>
  <name>http.agent.url</name>
  <value>$MasterDNS</value>
  <description>A URL to advertise in the User-Agent header. </description>
</property>
<property>
  <name>http.agent.email</name>
  <value>$Admin_email</value>
  <description>An email address
  </description>
</property>
</configuration>
EOF
}
107
[126]108
# Replace the line that follows the first line containing MARKER in FILE.
# Arguments: $1 - file to edit in place
#            $2 - marker text (matched as a fixed string)
#            $3.. - replacement line(s), written right after the marker line
# Returns: 1 (file untouched) when the marker is not found, otherwise the
#          exit status of sed.
_crawl_urlfilter_replace_after() {
  local file=$1 marker=$2
  shift 2
  local nl=$'\n'
  local line_no sed_text="" rule

  line_no=$(grep -n -m 1 -F -- "$marker" "$file" | cut -d: -f1)
  if [[ -z "$line_no" ]]; then
    # Without a line number sed would delete line 1 and append the rule after
    # every line, so refuse to touch the file.
    printf 'set_crawl-urlfilter: marker not found in %s: %s\n' "$file" "$marker" >&2
    return 1
  fi

  for rule in "$@"; do
    # sed's "a" text treats backslash as an escape character: double it.
    rule=${rule//\\/\\\\}
    if [[ -n "$sed_text" ]]; then
      sed_text+="\\${nl}"
    fi
    sed_text+=$rule
  done

  sed -i -e "$((line_no + 1))d" -e "${line_no}a ${sed_text}" "$file"
}

# Rewrite the filter rules in $NutchEZ_HOME/conf/crawl-urlfilter.txt: for each
# known comment marker the rule on the following line is replaced.
# Globals: NutchEZ_HOME (read)
# Returns: non-zero if the file is missing or any marker could not be found.
set_crawl-urlfilter() {
  local filter_file="$NutchEZ_HOME/conf/crawl-urlfilter.txt"
  local status=0

  echo "set $filter_file"
  if [[ ! -f "$filter_file" ]]; then
    printf 'set_crawl-urlfilter: %s does not exist\n' "$filter_file" >&2
    return 1
  fi

  _crawl_urlfilter_replace_after "$filter_file" \
    'skip file:, ftp:, & mailto: urls' \
    '-^(ftp|mailto):' || status=1

  _crawl_urlfilter_replace_after "$filter_file" \
    'skip image and other suffixes we can' \
    '-\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|sit|eps|wmf|mpg|xls|gz|rpm|tgz|mov|MOV|exe|jpeg|JPEG|bmp|BMP)$' || status=1

  _crawl_urlfilter_replace_after "$filter_file" \
    'skip URLs containing certain characters as probable queries, etc.' \
    '-[*!@]' || status=1

  # The final rule is replaced by a comment line plus an accept-all rule.
  _crawl_urlfilter_replace_after "$filter_file" \
    'skip everything else' \
    '# accecpt anything else' \
    '+.*' || status=1

  return "$status"
}
131
# Run "hadoop namenode -format" using the hadoop launcher under
# $NutchEZ_HOME/bin.
# Globals: NutchEZ_HOME (read)
# Returns: the exit status of the hadoop command.
format_HDFS() {
  echo "format HDFS..."
  # Quoted so an install path containing spaces still resolves.
  "$NutchEZ_HOME/bin/hadoop" namenode -format
}
136
# Run $NutchEZ_HOME/bin/start-all.sh.
# Globals: NutchEZ_HOME (read)
# Returns: the exit status of start-all.sh.
start_up_NutchEZ() {
  echo "start up NutchEZ..."
  # Quoted so an install path containing spaces still resolves.
  "$NutchEZ_HOME/bin/start-all.sh"
}
141
# Edit $NutchEZ_HOME/tomcat/conf/server.xml: locate the comment
#   <!-- A "Connector" using the shared thread pool-->
# delete the six lines that follow it and insert an HTTP connector on port
# 8080 with URIEncoding="UTF-8" in their place.
# Globals: NutchEZ_HOME (read)
# Returns: 1 (file untouched) if the marker comment is not found, otherwise
#          the exit status of sed.
# NOTE(review): if server.xml already has an active connector on port 8080,
# this adds a second one -- confirm against the shipped Tomcat server.xml.
set_server() {
  local server_xml="$NutchEZ_HOME/tomcat/conf/server.xml"
  local marker='<!-- A "Connector" using the shared thread pool-->'
  local line_no

  echo "$NutchEZ_HOME/tomcat/conf/server.xml"

  line_no=$(grep -n -m 1 -F -- "$marker" "$server_xml" | cut -d: -f1)
  if [[ -z "$line_no" ]]; then
    # An empty line number would make sed delete lines 1-6 and append the
    # connector after every line of the file.
    printf 'set_server: marker not found in %s\n' "$server_xml" >&2
    return 1
  fi

  sed -i "$((line_no + 1)),$((line_no + 6))d" "$server_xml" || return
  sed -i "${line_no}"'a    <Connector port="8080" protocol="HTTP/1.1"\
               connectionTimeout="20000"\
               redirectPort="8443" URIEncoding="UTF-8"\
               useBodyEncodingForURI="true" />' "$server_xml"
}
153
[131]154
# Insert the NutchEZ search-side properties into the web application's
# nutch-site.xml, directly after its first <configuration> line.
# Globals: NutchEZ_HOME, Admin_email (read)
# Returns: 1 (file untouched) if no <configuration> line is found, otherwise
#          the exit status of sed.
set_nutch-site2() {
  local site_xml="$NutchEZ_HOME/tomcat/webapps/ROOT/WEB-INF/classes/nutch-site.xml"
  local line_no

  echo "$NutchEZ_HOME/tomcat/webapps/ROOT/WEB-INF/classes/nutch-site.xml"

  # Find the line number after which the settings are inserted.
  line_no=$(grep -n -m 1 -F -- '<configuration>' "$site_xml" | cut -d: -f1)
  if [[ -z "$line_no" ]]; then
    printf 'set_nutch-site2: <configuration> not found in %s\n' "$site_xml" >&2
    return 1
  fi

  # Insert the settings. The block is fed to sed's "r" command on stdin
  # instead of being embedded as backslash-continued "a" text: one unescaped
  # blank line in that text used to end it early and made sed abort, and the
  # shell quoting dropped the quotes around 'User-Agent'.
  sed -i "${line_no}r /dev/stdin" "$site_xml" << EOF
<property>
  <name>http.agent.name</name>
  <value>nutch</value>
  <description>HTTP 'User-Agent' request header. </description>
</property>
<property>
  <name>http.agent.description</name>
  <value>MyTest</value>
  <description>Further description</description>
</property>
<property>
  <name>http.agent.url</name>
  <value>localhost</value>
  <description>A URL to advertise in the User-Agent header. </description>
</property>
<property>
  <name>http.agent.email</name>
  <value>$Admin_email</value>
  <description>An email address
  </description>
</property>
<property>
  <name>plugin.folders</name>
  <value>$NutchEZ_HOME/plugins</value>
  <description>Directories where nutch plugins are located. </description>
</property>
<property>
  <name>plugin.includes</name>
  <value>protocol-(http|httpclient)|urlfilter-regex|parse-(text|html|js|ext|msexcel|mspowerpoint|msword|oo|pdf|rss|swf|zip)|index-(more|basic|anchor)|query-(more|basic|site|url)|response-(json|xml)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
  <description> Regular expression naming plugin directory names</description>
</property>
<property>
  <name>parse.plugin.file</name>
  <value>parse-plugins.xml</value>
  <description>The name of the file that defines the associations between
  content-types and parsers.</description>
</property>
<property>
  <name>db.max.outlinks.per.page</name>
  <value>-1</value>
  <description> </description>
</property>
<property>
  <name>http.content.limit</name>
  <value>-1</value>
</property>
<property>
  <name>indexer.mergeFactor</name>
  <value>500</value>
  <description>The factor that determines the frequency of Lucene segment
  merges. This must not be less than 2, higher values increase indexing
  speed but lead to increased RAM usage, and increase the number of
  open file handles (which may lead to "Too many open files" errors).
  NOTE: the "segments" here have nothing to do with Nutch segments, they
  are a low-level data unit used by Lucene.
  </description>
</property>
<property>
  <name>indexer.minMergeDocs</name>
  <value>500</value>
  <description>This number determines the minimum number of Lucene
  Documents buffered in memory between Lucene segment merges. Larger
  values increase indexing speed and increase RAM usage.
  </description>
</property>
EOF
}
231
[136]232
# Generate all Nutch/Hadoop configuration files, in order: hadoop-env.sh,
# hadoop-site.xml, nutch-site.xml and the crawl URL filter.
# Returns: the status of the first step that fails (later steps are then
#          skipped), or the status of the last step.
set_Nutch_conf() {
  set_hadoop-env || return
  set_haoop-site || return
  set_nutch-site || return
  set_crawl-urlfilter
}
239
240
# Unpack /opt/nutch-1.0.tar.gz, move the extracted tree to $NutchEZ_HOME,
# hand it to nutchuser and generate the configuration files.
# Globals: NutchEZ_HOME (read)
# Side effects: changes the working directory to /opt.
# Returns: non-zero as soon as any step fails; later steps are skipped.
Install_Nutch() {
  # mv would nest nutch-1.0 inside an existing directory instead of
  # renaming it, leaving $NutchEZ_HOME/conf missing.
  if [[ -e "$NutchEZ_HOME" ]]; then
    printf 'Install_Nutch: %s already exists; refusing to install over it\n' \
      "$NutchEZ_HOME" >&2
    return 1
  fi

  cd /opt || return
  # Use "tar zxvf" instead for a verbose file listing.
  tar zxf /opt/nutch-1.0.tar.gz || return
  # Same directory that chown and the configuration steps operate on.
  mv /opt/nutch-1.0 "$NutchEZ_HOME" || return
  chown -R nutchuser:nutchuser "$NutchEZ_HOME" || return
  set_Nutch_conf
}
[125]249
# Install Tomcat: unpack apache-tomcat-6.0.18.tar.gz from /opt, move it to
# $NutchEZ_HOME/tomcat, unpack nutch-1.0.war as Tomcat's ROOT web application
# (the stock ROOT is kept as ROOT-ori) and patch server.xml.
# Globals: NutchEZ_HOME (read); $NutchEZ_HOME must already exist as a
#          directory, since the Tomcat tree is moved into it.
# Side effects: changes the working directory to $NutchEZ_HOME.
# Returns: non-zero as soon as any step fails; later steps are skipped.
Install_Tomcat() {
  local tomcat_dist=apache-tomcat-6.0.18

  cd /opt/ || return
  # Use "tar zxvf" instead for a verbose file listing.
  tar zxf "${tomcat_dist}.tar.gz" || return
  mv "$tomcat_dist" "$NutchEZ_HOME" || return
  cd "$NutchEZ_HOME" || return
  mv "$tomcat_dist" tomcat || return
  mkdir web || return
  # mkdir $NutchEZ_HOME/search
  # NOTE(review): files extracted below are created after this chown and so
  # keep the invoking user's ownership -- confirm whether that is intended.
  chown -R nutchuser:nutchuser "$NutchEZ_HOME" || return
  # "jar x" always extracts into the current directory; extra arguments
  # select archive entries, they do not name a destination. Extract from
  # inside web/ (in a subshell, so the working directory is unchanged).
  ( cd web && jar -xvf ../nutch-1.0.war ) || return
  mv "$NutchEZ_HOME/tomcat/webapps/ROOT" "$NutchEZ_HOME/tomcat/webapps/ROOT-ori" || return
  mv "$NutchEZ_HOME/web" "$NutchEZ_HOME/tomcat/webapps/ROOT" || return
  set_server
  #set_nutch-site2
}
267
# Run Tomcat's startup.sh from $NutchEZ_HOME/tomcat/bin.
# Globals: NutchEZ_HOME (read)
# Returns: the exit status of startup.sh.
start_up_tomcat() {
  echo "start up tomcat..."
  # Quoted so an install path containing spaces still resolves.
  "$NutchEZ_HOME/tomcat/bin/startup.sh"
}
Note: See TracBrowser for help on using the repository browser.