source: nutchez-0.2/src/test/install_func.sh @ 131

Last change on this file since 131 was 131, checked in by shunfa, 14 years ago

modify install, install_func.sh

  • Property svn:executable set to *
File size: 8.4 KB
Line 
1#!/bin/bash
2source install_lang
3####### garbage here #############
# Print the value of the global $Good on one line.
# NOTE(review): $Good is presumably defined by the sourced install_lang
# file -- confirm there.
mainFunction() {
  echo "$Good"
}
# Print the value of the global $Bra_Bra_Bra on one line.
# NOTE(review): $Bra_Bra_Bra is presumably defined by the sourced
# install_lang file -- confirm there.
braBraBra() {
  echo "$Bra_Bra_Bra"
}
12####### garbage end ###############
13
14
15
16####### fafa code here ###########
17
# Assumed setup:
#   /home/nutchuser/NutchEZ_source holds three files:
#     install.sh, nutch-1.0.tar.gz, apache-tomcat-6.0.18.tar.gz
#   The installation path is /opt/NutchEZ.

Install_source=/home/nutchuser/NutchEZ_source
NutchEZ_HOME=/opt/NutchEZ
# Address of eth0: take the ifconfig line containing "inet addr", drop
# everything up to "addr:", drop "Bcast" and what follows, then cut at the
# first remaining space.
MasterIP_Address=$(/sbin/ifconfig eth0 \
  | grep 'inet addr' \
  | sed -e 's/^.*addr://g' -e 's/Bcast.*$//g' -e 's/ .*//')
26
27
# Ask for the administrator's e-mail address and the master DNS name.
# Globals: Admin_email (written), MasterDNS (written)
# Input:   two lines read from standard input
set_install_information() {
  local email_prompt="Please enter administrator's e-mail address:  "
  local dns_prompt="Please enter the Master DNS:  "

  read -p "$email_prompt" Admin_email
  read -p "$dns_prompt" MasterDNS
}
32
# Print the collected install settings, one per line.
# Globals: Admin_email (read), MasterDNS (read)
show_info() {
  printf '%s\n' \
    "Administrator's e-mail address is $Admin_email." \
    "The master DNS is: $MasterDNS"
}
37
# Ask the user to confirm the install settings.
# Globals: confirm (written) -- the raw answer; the prompt offers 1 (Yes)
#          and 2 (No), but the answer is not validated here.
confirm_install_information() {
  local question="Please confirm your install infomation: 1.Yes 2.No  "

  read -p "$question" confirm
}
41
# Run every Nutch configuration step, in order: hadoop-env, hadoop-site,
# nutch-site, crawl-urlfilter. The exit status is that of the last step.
set_Nutch_conf() {
  local step

  # "set_haoop-site" is spelled the way the function is defined in this file.
  for step in set_hadoop-env set_haoop-site set_nutch-site set_crawl-urlfilter; do
    "$step"
  done
}
48
# Append the NutchEZ environment settings to $NutchEZ_HOME/conf/hadoop-env.sh.
# Globals:  NutchEZ_HOME (read)
# Effects:  changes the working directory to $NutchEZ_HOME/conf; every call
#           appends the eight export lines again
# Returns:  non-zero, without writing anything, when the conf directory
#           cannot be entered
set_hadoop-env () {
  echo "set $NutchEZ_HOME/conf/hadoop-env.sh"
  # Stop here if the conf directory is missing; otherwise the settings would
  # land in a stray hadoop-env.sh in whatever directory we happen to be in.
  cd "$NutchEZ_HOME/conf/" || return 1
  cat >> hadoop-env.sh << EOF
export JAVA_HOME=/usr/lib/jvm/java-6-sun
export HADOOP_HOME=$NutchEZ_HOME
export HADOOP_LOG_DIR=/tmp/NutchEZ/logs
export HADOOP_SLAVES=$NutchEZ_HOME/conf/slaves
export HADOOP_CONF_DIR=$NutchEZ_HOME/conf
export HADOOP_PID_DIR=/tmp/hadoop/pid
export NUTCH_HOME=$NutchEZ_HOME
export NUTCH_CONF_DIR=$NutchEZ_HOME/conf
EOF
}
64
# Write $NutchEZ_HOME/conf/hadoop-site.xml, pointing the default file system
# (port 9000) and the MapReduce job tracker (port 9001) at $MasterDNS.
# Globals:  NutchEZ_HOME (read), MasterDNS (read)
# Effects:  changes the working directory to $NutchEZ_HOME/conf and
#           overwrites hadoop-site.xml there
# Returns:  non-zero, without writing anything, when the conf directory
#           cannot be entered
set_haoop-site () {
  echo "set $NutchEZ_HOME/conf/hadoop-site.xml"
  # Stop here if the conf directory is missing; otherwise a hadoop-site.xml in
  # the current directory would be overwritten instead.
  cd "$NutchEZ_HOME/conf/" || return 1
  cat > hadoop-site.xml << EOF
<configuration>
<property>
    <name>fs.default.name</name>
    <value>${MasterDNS}:9000</value>
    <description> The name of the default file system. Either the literal string "local" or a host:port for NDFS. </description>
</property>
<property>
    <name>mapred.job.tracker</name>
    <value>${MasterDNS}:9001</value>
    <description> The host and port that the MapReduce job tracker runs at. If "local", then jobs are run in-process as a single map and reduce task. </description>
</property>
</configuration>
EOF
}
84
# Write $NutchEZ_HOME/conf/nutch-site.xml with the crawler's HTTP agent
# identity (name, description, URL and contact e-mail).
# Globals:  NutchEZ_HOME (read), MasterDNS (read), Admin_email (read)
# Effects:  changes the working directory to $NutchEZ_HOME/conf and
#           overwrites nutch-site.xml there
# Returns:  non-zero, without writing anything, when the conf directory
#           cannot be entered
set_nutch-site () {
  echo "set $NutchEZ_HOME/conf/nutch-site.xml"
  # Stop here if the conf directory is missing; otherwise a nutch-site.xml in
  # the current directory would be overwritten instead.
  cd "$NutchEZ_HOME/conf/" || return 1
  # The e-mail property must be named http.agent.email (the same key that
  # set_nutch-site2 writes); naming it after the master DNS produced a
  # property Nutch never reads.
  cat > nutch-site.xml << EOF
<configuration>
<property>
  <name>http.agent.name</name>
  <value>nutchuser</value>
  <description>HTTP 'User-Agent' request header. </description>
</property>
<property>
  <name>http.agent.description</name>
  <value>MyTest</value>
  <description>Further description</description>
</property>
<property>
  <name>http.agent.url</name>
  <value>$MasterDNS</value>
  <description>A URL to advertise in the User-Agent header. </description>
</property>
<property>
  <name>http.agent.email</name>
  <value>$Admin_email</value>
  <description>An email address
  </description>
</property>
</configuration>
EOF
}
114
115
# Replace the line that follows a marker line in a text file.
# Arguments: $1    - file to edit in place
#            $2    - literal text identifying the marker line (first match)
#            $3... - line(s) to write directly after the marker, replacing
#                    the single line that used to follow it
# Returns:   1, leaving the file untouched, when the marker is absent or the
#            new content cannot be assembled
_crawl_urlfilter_replace_after () {
  local file=$1 marker=$2
  shift 2
  local line_no tmp

  line_no=$(grep -n -m1 -F -- "$marker" "$file" | cut -d: -f1)
  # Without a line number the edit must not run at all: an empty address
  # would make the delete hit line 1 and the insert apply to every line.
  if [[ -z "$line_no" ]]; then
    echo "set_crawl-urlfilter: marker '$marker' not found in $file" >&2
    return 1
  fi

  tmp=$(mktemp) || return 1
  if {
    head -n "$line_no" "$file" &&
      printf '%s\n' "$@" &&
      tail -n "+$((line_no + 2))" "$file"
  } > "$tmp"; then
    # Copy the content back instead of renaming so the original file keeps
    # its owner and mode.
    cat "$tmp" > "$file"
  else
    rm -f -- "$tmp"
    return 1
  fi
  rm -f -- "$tmp"
}

# Rewrite four rules in $NutchEZ_HOME/conf/crawl-urlfilter.txt. Each rule is
# located through the comment line above it and the line after that comment
# is replaced:
#   - skip only ftp: and mailto: URLs
#   - skip a fixed list of binary/media suffixes
#   - skip URLs containing one of * ! @
#   - accept everything else
# Globals:  NutchEZ_HOME (read)
# Returns:  1 when at least one marker comment was not found (rules whose
#           markers were found are still updated), 0 otherwise
set_crawl-urlfilter () {
  local filter_file="$NutchEZ_HOME/conf/crawl-urlfilter.txt"
  local status=0

  echo "set $filter_file"

  _crawl_urlfilter_replace_after "$filter_file" \
    'skip file:, ftp:, & mailto: urls' \
    '-^(ftp|mailto):' || status=1

  _crawl_urlfilter_replace_after "$filter_file" \
    'skip image and other suffixes we can' \
    '-\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|sit|eps|wmf|mpg|xls|gz|rpm|tgz|mov|MOV|exe|jpeg|JPEG|bmp|BMP)$' || status=1

  _crawl_urlfilter_replace_after "$filter_file" \
    'skip URLs containing certain characters as probable queries, etc.' \
    '-[*!@]' || status=1

  _crawl_urlfilter_replace_after "$filter_file" \
    'skip everything else' \
    '# accecpt anything else' \
    '+.*' || status=1

  return "$status"
}
138
# Announce the step, then run "hadoop namenode -format" using the hadoop
# launcher under $NutchEZ_HOME/bin. The exit status is the launcher's.
# Globals: NutchEZ_HOME (read)
format_HDFS() {
  local hadoop_cmd=$NutchEZ_HOME/bin/hadoop

  echo "format HDFS..."
  $hadoop_cmd namenode -format
}
143
# Announce the step, then run $NutchEZ_HOME/bin/start-all.sh. The exit
# status is that script's.
# Globals: NutchEZ_HOME (read)
start_up_NutchEZ() {
  local start_script=$NutchEZ_HOME/bin/start-all.sh

  echo "start up NutchEZ..."
  $start_script
}
148
# Edit $NutchEZ_HOME/tomcat/conf/server.xml: locate the comment
#   <!-- A "Connector" using the shared thread pool-->
# delete the six lines that follow it, and insert in their place an HTTP/1.1
# connector on port 8080 with URIEncoding="UTF-8" and
# useBodyEncodingForURI="true".
# Globals:  NutchEZ_HOME (read)
# Returns:  1, leaving the file untouched, when the marker comment is absent
set_server () {
  local server_xml="$NutchEZ_HOME/tomcat/conf/server.xml"
  local marker='<!-- A "Connector" using the shared thread pool-->'
  local line_no

  echo "$server_xml"

  line_no=$(grep -n -m1 -F -- "$marker" "$server_xml" | cut -d: -f1)
  # Without a line number the sed addresses below would degrade to "1,6d"
  # and an address-less append, i.e. the first six lines would be deleted and
  # the connector inserted after every remaining line.
  if [[ -z "$line_no" ]]; then
    echo "set_server: marker comment not found in $server_xml; file left unchanged" >&2
    return 1
  fi

  sed -i "$((line_no + 1)),$((line_no + 6))d" "$server_xml"
  sed -i "${line_no}"'a    <Connector port="8080" protocol="HTTP/1.1"\
               connectionTimeout="20000"\
               redirectPort="8443" URIEncoding="UTF-8"\
               useBodyEncodingForURI="true" />\
' "$server_xml"
}
160
161
# Insert the search web application's Nutch properties into
# $NutchEZ_HOME/tomcat/webapps/ROOT/WEB-INF/classes/nutch-site.xml, directly
# after the opening <configuration> tag.
# Globals:  NutchEZ_HOME (read), Admin_email (read)
# Returns:  1, leaving the file untouched, when no <configuration> tag is
#           found; otherwise the exit status of the sed edit
set_nutch-site2 () {
  local site_xml="$NutchEZ_HOME/tomcat/webapps/ROOT/WEB-INF/classes/nutch-site.xml"
  local line_no snippet
  local status=0

  echo "$site_xml"

  # Find the line number after which the settings are inserted.
  line_no=$(grep -n -m1 -F -- '<configuration>' "$site_xml" | cut -d: -f1)
  if [[ -z "$line_no" ]]; then
    echo "set_nutch-site2: no <configuration> tag in $site_xml; file left unchanged" >&2
    return 1
  fi

  # The properties go into a scratch file that sed reads with "r". Passing
  # them as inline "a" text was fragile: a blank line ended the text early,
  # the embedded single quotes closed the shell quoting, and backslashes in
  # the substituted values would have been interpreted by sed.
  snippet=$(mktemp) || return 1
  cat > "$snippet" << EOF
<property>
  <name>http.agent.name</name>
  <value>nutch</value>
  <description>HTTP 'User-Agent' request header. </description>
</property>
<property>
  <name>http.agent.description</name>
  <value>MyTest</value>
  <description>Further description</description>
</property>
<property>
  <name>http.agent.url</name>
  <value>localhost</value>
  <description>A URL to advertise in the User-Agent header. </description>
</property>
<property>
  <name>http.agent.email</name>
  <value>$Admin_email</value>
  <description>An email address
  </description>
</property>
<property>
  <name>plugin.folders</name>
  <value>$NutchEZ_HOME/plugins</value>
  <description>Directories where nutch plugins are located. </description>
</property>
<property>
  <name>plugin.includes</name>
  <value>protocol-(http|httpclient)|urlfilter-regex|parse-(text|html|js|ext|msexcel|mspowerpoint|msword|oo|pdf|rss|swf|zip)|index-(more|basic|anchor)|query-(more|basic|site|url)|response-(json|xml)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
  <description> Regular expression naming plugin directory names</description>
</property>
<property>
  <name>parse.plugin.file</name>
  <value>parse-plugins.xml</value>
  <description>The name of the file that defines the associations between
  content-types and parsers.</description>
</property>
<property>
  <name>db.max.outlinks.per.page</name>
  <value>-1</value>
  <description> </description>
</property>
<property>
  <name>http.content.limit</name>
  <value>-1</value>
</property>
<property>
  <name>indexer.mergeFactor</name>
  <value>500</value>
  <description>The factor that determines the frequency of Lucene segment
  merges. This must not be less than 2, higher values increase indexing
  speed but lead to increased RAM usage, and increase the number of
  open file handles (which may lead to "Too many open files" errors).
  NOTE: the "segments" here have nothing to do with Nutch segments, they
  are a low-level data unit used by Lucene.
  </description>
</property>
<property>
  <name>indexer.minMergeDocs</name>
  <value>500</value>
  <description>This number determines the minimum number of Lucene
  Documents buffered in memory between Lucene segment merges. Larger
  values increase indexing speed and increase RAM usage.
  </description>
</property>
EOF

  sed -i "${line_no}r ${snippet}" "$site_xml" || status=$?
  rm -f -- "$snippet"
  return "$status"
}
238
# Unpack /opt/nutch-1.0.tar.gz inside /opt, rename the resulting nutch-1.0
# directory to /opt/NutchEZ, give the tree under $NutchEZ_HOME to nutchuser
# and write the Nutch configuration files.
# Globals:  NutchEZ_HOME (read)
# Effects:  changes the working directory (to /opt, then wherever
#           set_Nutch_conf leaves it)
# Returns:  1 as soon as entering /opt, unpacking, or renaming fails, so the
#           configuration writers never run against a missing tree;
#           otherwise the status of set_Nutch_conf
Install_Nutch () {
  cd /opt || return 1
  # Quiet extraction; use "tar zxvf" instead to list the files.
  tar zxf /opt/nutch-1.0.tar.gz || return 1
  mv /opt/nutch-1.0 NutchEZ || return 1
  chown -R nutchuser:nutchuser "$NutchEZ_HOME"
  set_Nutch_conf
}
247
# Install Tomcat 6.0.18 as $NutchEZ_HOME/tomcat and deploy the Nutch search
# web application as its ROOT webapp.
# Steps: unpack the Tomcat tarball found in /opt, move it to
# $NutchEZ_HOME/tomcat, unpack nutch-1.0.war into a fresh "web" directory,
# keep the stock ROOT webapp as ROOT-ori, install "web" as the new ROOT, and
# finally adjust the connector in server.xml through set_server.
# Globals:  NutchEZ_HOME (read)
# Effects:  changes the working directory to $NutchEZ_HOME
# Returns:  1 as soon as one of the unpack/move steps fails; otherwise the
#           status of set_server
Install_Tomcat () {
  cd /opt/ || return 1
  # Quiet extraction; use "tar zxvf" instead to list the files.
  tar zxf apache-tomcat-6.0.18.tar.gz || return 1
  mv apache-tomcat-6.0.18 "$NutchEZ_HOME" || return 1
  cd "$NutchEZ_HOME" || return 1
  mv apache-tomcat-6.0.18 tomcat || return 1
  mkdir web || return 1
  # "jar x" always unpacks into the current directory, and any argument after
  # the archive names an entry to extract, not a destination. Unpack from
  # inside web/ so the application really ends up there.
  (cd web && jar -xvf ../nutch-1.0.war) || return 1
  # Hand over ownership only now, so the freshly unpacked web application is
  # covered as well.
  chown -R nutchuser:nutchuser "$NutchEZ_HOME"
  mv "$NutchEZ_HOME/tomcat/webapps/ROOT" "$NutchEZ_HOME/tomcat/webapps/ROOT-ori" || return 1
  mv "$NutchEZ_HOME/web" "$NutchEZ_HOME/tomcat/webapps/ROOT" || return 1
  set_server
  # set_nutch-site2 is intentionally left disabled here, as in the original.
}
265
# Announce the step, then run $NutchEZ_HOME/tomcat/bin/startup.sh. The exit
# status is that script's.
# Globals: NutchEZ_HOME (read)
start_up_tomcat() {
  local startup_script=$NutchEZ_HOME/tomcat/bin/startup.sh

  echo "start up tomcat..."
  $startup_script
}
Note: See TracBrowser for help on using the repository browser.