Changeset 131 for nutchez-0.2
- Timestamp: May 27, 2010, 5:21:07 PM (14 years ago)
- Location: nutchez-0.2/src/test
- Files: 2 edited
Legend:
- Unmodified
- Added
- Removed
-
nutchez-0.2/src/test/install
r130 r131 24 24 # make_ssh_key 25 25 26 Install_Nutch 27 Install_Tomcat 28 26 Install_Nutch 29 27 # make_client_install 30 31 28 format_HDFS 32 29 start_up_NutchEZ -
nutchez-0.2/src/test/install_func.sh
r130 r131 38 38 confirm_install_information () { 39 39 read -p "Please confirm your install infomation: 1.Yes 2.No " confirm 40 } 41 42 set_Nutch_conf () { 43 set_hadoop-env 44 set_haoop-site 45 set_nutch-site 46 set_crawl-urlfilter 47 } 48 49 # set $NutchEZ_HOME/conf/hadoop-env.sh 50 set_hadoop-env () { 51 echo "set $NutchEZ_HOME/conf/hadoop-env.sh" 52 cd $NutchEZ_HOME/conf/ 53 cat >> hadoop-env.sh << EOF 54 export JAVA_HOME=/usr/lib/jvm/java-6-sun 55 export HADOOP_HOME=$NutchEZ_HOME 56 export HADOOP_LOG_DIR=/tmp/NutchEZ/logs 57 export HADOOP_SLAVES=$NutchEZ_HOME/conf/slaves 58 export HADOOP_CONF_DIR=$NutchEZ_HOME/conf 59 export HADOOP_PID_DIR=/tmp/hadoop/pid 60 export NUTCH_HOME=$NutchEZ_HOME 61 export NUTCH_CONF_DIR=$NutchEZ_HOME/conf 62 EOF 63 } 64 65 # set $NutchEZ_HOME/conf/hadoop-site.xml 66 set_haoop-site () { 67 echo "set $NutchEZ_HOME/conf/hadoop-site.xml" 68 cd $NutchEZ_HOME/conf/ 69 cat > hadoop-site.xml << EOF 70 <configuration> 71 <property> 72 <name>fs.default.name</name> 73 <value>$MasterDNS:9000</value> 74 <description> The name of the default file system. Either the literal string "local" or a host:port for NDFS. </description> 75 </property> 76 <property> 77 <name>mapred.job.tracker</name> 78 <value>$MasterDNS:9001</value> 79 <description> The host and port that the MapReduce job tracker runs at. If "local", then jobs are run in-process as a single map and reduce task. </description> 80 </property> 81 </configuration> 82 EOF 83 } 84 85 set_nutch-site () { 86 echo "set $NutchEZ_HOME/conf/nutch-site.xml" 87 cd $NutchEZ_HOME/conf/ 88 cat > nutch-site.xml << EOF 89 <configuration> 90 <property> 91 <name>http.agent.name</name> 92 <value>nutchuser</value> 93 <description>HTTP 'User-Agent' request header. 
</description> 94 </property> 95 <property> 96 <name>http.agent.description</name> 97 <value>MyTest</value> 98 <description>Further description</description> 99 </property> 100 <property> 101 <name>http.agent.url</name> 102 <value>$MasterDNS</value> 103 <description>A URL to advertise in the User-Agent header. </description> 104 </property> 105 <property> 106 <name>$MasterDNS</name> 107 <value>$Admin_email</value> 108 <description>An email address 109 </description> 110 </property> 111 </configuration> 112 EOF 113 } 114 115 116 set_crawl-urlfilter () { 117 echo "set $NutchEZ_HOME/conf/set_crawl-urlfilter.txt" 118 Line_NO=`cat $NutchEZ_HOME'/conf/crawl-urlfilter.txt' | grep -n 'skip file:, ftp:, & mailto: urls' | sed 's/:.*//g'` 119 sed -i ''$((Line_NO+1))'d' $NutchEZ_HOME/conf/crawl-urlfilter.txt 120 sed -i ''$Line_NO'a -^(ftp|mailto):' $NutchEZ_HOME/conf/crawl-urlfilter.txt 121 122 123 Line_NO=`cat $NutchEZ_HOME'/conf/crawl-urlfilter.txt' | grep -n 'skip image and other suffixes we can' | sed 's/:.*//g'` 124 sed -i ''$((Line_NO+1))'d' $NutchEZ_HOME/conf/crawl-urlfilter.txt 125 sed -i ''$Line_NO'a -\\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|sit|eps|wmf|mpg|xls|gz|rpm|tgz|mov|MOV|exe|jpeg|JPEG|bmp|BMP)$' $NutchEZ_HOME/conf/crawl-urlfilter.txt 126 127 128 Line_NO=`cat $NutchEZ_HOME'/conf/crawl-urlfilter.txt' | grep -n 'skip URLs containing certain characters as probable queries, etc.' | sed 's/:.*//g'` 129 sed -i ''$((Line_NO+1))'d' $NutchEZ_HOME/conf/crawl-urlfilter.txt 130 sed -i ''$Line_NO'a -[*!@]' $NutchEZ_HOME/conf/crawl-urlfilter.txt 131 132 133 Line_NO=`cat $NutchEZ_HOME'/conf/crawl-urlfilter.txt' | grep -n 'skip everything else' | sed 's/:.*//g'` 134 sed -i ''$((Line_NO+1))'d' $NutchEZ_HOME/conf/crawl-urlfilter.txt 135 sed -i ''$Line_NO'a +.*' $NutchEZ_HOME/conf/crawl-urlfilter.txt 136 sed -i ''$Line_NO'a # accecpt anything else' $NutchEZ_HOME/conf/crawl-urlfilter.txt 137 } 138 139 format_HDFS () { 140 echo "format HDFS..." 
141 $NutchEZ_HOME/bin/hadoop namenode -format 142 } 143 144 start_up_NutchEZ (){ 145 echo "start up NutchEZ..." 146 $NutchEZ_HOME/bin/start-all.sh 147 } 148 149 set_server () { 150 echo "$NutchEZ_HOME/tomcat/conf/server.xml" 151 Line_NO=`cat $NutchEZ_HOME'/tomcat/conf/server.xml' | grep -n '<!-- A "Connector" using the shared thread pool-->' | sed 's/:.*//g'` 152 153 sed -i ''$((Line_NO+1))','$((Line_NO+6))'d' $NutchEZ_HOME/tomcat/conf/server.xml 154 sed -i ''$Line_NO'a <Connector port="8080" protocol="HTTP/1.1"\ 155 connectionTimeout="20000"\ 156 redirectPort="8443" URIEncoding="UTF-8"\ 157 useBodyEncodingForURI="true" />\ 158 ' $NutchEZ_HOME/tomcat/conf/server.xml 159 } 160 161 162 set_nutch-site2 () { 163 echo "$NutchEZ_HOME/tomcat/webapps/ROOT/WEB-INF/classes/nutch-site.xml" 164 165 # 搜尋加入設定的行號位址 166 line_NO=`cat $NutchEZ_HOME'/tomcat/webapps/ROOT/WEB-INF/classes/nutch-site.xml' | grep -n '<'configuration'>' | sed 's/:.*//g'` 167 168 # 加入設定檔 169 sed -i ''$line_NO'a <property>\ 170 <name>http.agent.name</name>\ 171 <value>nutch</value>\ 172 <description>HTTP 'User-Agent' request header. </description> \ 173 </property>\ 174 <property>\ 175 <name>http.agent.description</name>\ 176 <value>MyTest</value>\ 177 <description>Further description</description> \ 178 </property>\ 179 <property>\ 180 <name>http.agent.url</name> \ 181 <value>localhost</value> \ 182 <description>A URL to advertise in the User-Agent header. </description> \ 183 </property>\ 184 <property>\ 185 <name>http.agent.email</name>\ 186 <value>'$Admin_email'</value> \ 187 <description>An email address \ 188 </description> \ 189 </property>\ 190 <property>\ 191 <name>plugin.folders</name>\ 192 <value>'$NutchEZ_HOME'/plugins</value>\ 193 <description>Directories where nutch plugins are located. 
</description>\ 194 </property>\ 195 <property>\ 196 <name>plugin.includes</name>\ 197 <value>protocol-(http|httpclient)|urlfilter-regex|parse-(text|html|js|ext|msexcel|mspowerpoint|msword|oo|pdf|rss|swf|zip)|index-(more|basic|anchor)|query-(more|basic|site|url)|response-(json|xml)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value>\ 198 <description> Regular expression naming plugin directory names</description>\ 199 </property>\ 200 <property>\ 201 <name>parse.plugin.file</name>\ 202 <value>parse-plugins.xml</value>\ 203 <description>The name of the file that defines the associations between\ 204 content-types and parsers.</description>\ 205 </property>\ 206 <property>\ 207 <name>db.max.outlinks.per.page</name>\ 208 <value>-1</value>\ 209 <description> </description>\ 210 </property> \ 211 <property>\ 212 <name>http.content.limit</name> \ 213 <value>-1</value>\ 214 </property>\ 215 <property>\ 216 <name>indexer.mergeFactor</name>\ 217 <value>500</value>\ 218 <description>The factor that determines the frequency of Lucene segment\ 219 merges. This must not be less than 2, higher values increase indexing\ 220 speed but lead to increased RAM usage, and increase the number of\ 221 open file handles (which may lead to "Too many open files" errors).\ 222 NOTE: the "segments" here have nothing to do with Nutch segments, they\ 223 are a low-level data unit used by Lucene.\ 224 </description>\ 225 </property>\ 226 227 <property>\ 228 <name>indexer.minMergeDocs</name>\ 229 <value>500</value>\ 230 <description>This number determines the minimum number of Lucene\ 231 Documents buffered in memory between Lucene segment merges. 
Larger\ 232 values increase indexing speed and increase RAM usage.\ 233 </description>\ 234 </property>\ 235 236 ' $NutchEZ_HOME/tomcat/webapps/ROOT/WEB-INF/classes/nutch-site.xml 40 237 } 41 238 … … 49 246 } 50 247 51 set_Nutch_conf () {52 set_hadoop-env53 set_haoop-site54 set_nutch-site55 set_crawl-urlfilter56 }57 58 # set $NutchEZ_HOME/conf/hadoop-env.sh59 set_hadoop-env () {60 echo "set $NutchEZ_HOME/conf/hadoop-env.sh"61 cd $NutchEZ_HOME/conf/62 cat >> hadoop-env.sh << EOF63 export JAVA_HOME=/usr/lib/jvm/java-6-sun64 export HADOOP_HOME=/opt/NutchEZ65 export HADOOP_LOG_DIR=/tmp/NutchEZ/logs66 export HADOOP_SLAVES=/opt/NutchEZ/conf/slaves67 EOF68 }69 70 # set $NutchEZ_HOME/conf/hadoop-site.xml71 set_haoop-site () {72 echo "set $NutchEZ_HOME/conf/hadoop-site.xml"73 cd $NutchEZ_HOME/conf/74 cat > hadoop-site.xml << EOF75 <configuration>76 <property>77 <name>fs.default.name</name>78 <value>$MasterDNS:9000</value>79 <description> The name of the default file system. Either the literal string "local" or a host:port for NDFS. </description>80 </property>81 <property>82 <name>mapred.job.tracker</name>83 <value>$MasterDNS:9001</value>84 <description> The host and port that the MapReduce job tracker runs at. If "local", then jobs are run in-process as a single map and reduce task. </description>85 </property>86 </configuration>87 EOF88 }89 90 set_nutch-site () {91 echo "set $NutchEZ_HOME/conf/nutch-site.xml"92 cd $NutchEZ_HOME/conf/93 cat > nutch-site.xml << EOF94 <configuration>95 <property>96 <name>http.agent.name</name>97 <value>nutchuser</value>98 <description>HTTP 'User-Agent' request header. </description>99 </property>100 <property>101 <name>http.agent.description</name>102 <value>MyTest</value>103 <description>Further description</description>104 </property>105 <property>106 <name>http.agent.url</name>107 <value>$MasterDNS</value>108 <description>A URL to advertise in the User-Agent header. 
</description>109 </property>110 <property>111 <name>$MasterDNS</name>112 <value>$Admin_email</value>113 <description>An email address114 </description>115 </property>116 </configuration>117 EOF118 }119 120 121 set_crawl-urlfilter () {122 echo "set $NutchEZ_HOME/conf/set_crawl-urlfilter.txt"123 Line_NO=`cat $NutchEZ_HOME'/conf/crawl-urlfilter.txt' | grep -n 'skip file:, ftp:, & mailto: urls' | sed 's/:.*//g'`124 sed -i ''$((Line_NO+1))'d' $NutchEZ_HOME/conf/crawl-urlfilter.txt125 sed -i ''$Line_NO'a -^(ftp|mailto):' $NutchEZ_HOME/conf/crawl-urlfilter.txt126 127 128 Line_NO=`cat $NutchEZ_HOME'/conf/crawl-urlfilter.txt' | grep -n 'skip image and other suffixes we can' | sed 's/:.*//g'`129 sed -i ''$((Line_NO+1))'d' $NutchEZ_HOME/conf/crawl-urlfilter.txt130 sed -i ''$Line_NO'a -\\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|sit|eps|wmf|mpg|xls|gz|rpm|tgz|mov|MOV|exe|jpeg|JPEG|bmp|BMP)$' $NutchEZ_HOME/conf/crawl-urlfilter.txt131 132 133 Line_NO=`cat $NutchEZ_HOME'/conf/crawl-urlfilter.txt' | grep -n 'skip URLs containing certain characters as probable queries, etc.' 
| sed 's/:.*//g'`134 sed -i ''$((Line_NO+1))'d' $NutchEZ_HOME/conf/crawl-urlfilter.txt135 sed -i ''$Line_NO'a -[*!@]' $NutchEZ_HOME/conf/crawl-urlfilter.txt136 137 138 Line_NO=`cat $NutchEZ_HOME'/conf/crawl-urlfilter.txt' | grep -n 'skip everything else' | sed 's/:.*//g'`139 sed -i ''$((Line_NO+1))'d' $NutchEZ_HOME/conf/crawl-urlfilter.txt140 sed -i ''$Line_NO'a +.*' $NutchEZ_HOME/conf/crawl-urlfilter.txt141 sed -i ''$Line_NO'a # accecpt anything else' $NutchEZ_HOME/conf/crawl-urlfilter.txt142 }143 144 format_HDFS () {145 echo "format HDFS..."146 $NutchEZ_HOME/bin/hadoop namenode -format147 }148 149 start_up_NutchEZ (){150 echo "start up NutchEZ..."151 $NutchEZ_HOME/bin/start-all.sh152 }153 154 248 # install tomcat 155 249 Install_Tomcat () { 156 250 cd /opt/ 157 # tar zx f apache-tomcat-6.0.18.tar.gz251 # tar zxvf apache-tomcat-6.0.18.tar.gz 158 252 tar zxf apache-tomcat-6.0.18.tar.gz 159 253 mv apache-tomcat-6.0.18 $NutchEZ_HOME 160 254 cd $NutchEZ_HOME 161 255 mv apache-tomcat-6.0.18 tomcat 256 mkdir web 257 # mkdir $NutchEZ_HOME/search 162 258 chown -R nutchuser:nutchuser $NutchEZ_HOME 163 mkdir $NutchEZ_HOME/web 164 jar -xvf $NutchEZ_HOME/nutch-1.0.war $NutchEZ_HOME/web 259 jar -xvf nutch-1.0.war web 165 260 mv $NutchEZ_HOME/tomcat/webapps/ROOT $NutchEZ_HOME/tomcat/webapps/ROOT-ori 166 261 mv $NutchEZ_HOME/web $NutchEZ_HOME/tomcat/webapps/ROOT 167 mkdir $NutchEZ_HOME/search168 262 set_server 169 set_nutch-site 170 } 171 172 173 set_server () { 174 echo "$NutchEZ_HOME/tomcat/conf/server.xml" 175 Line_NO=`cat $NutchEZ_HOME'/tomcat/conf/server.xml' | grep -n '<!-- A "Connector" using the shared thread pool-->' | sed 's/:.*//g'` 176 177 sed -i ''$((Line_NO+1))','$((Line_NO+6))'d' $NutchEZ_HOME/tomcat/conf/server.xml 178 sed -i ''$Line_NO'a <Connector port="8080" protocol="HTTP/1.1"\ 179 connectionTimeout="20000"\ 180 redirectPort="8443" URIEncoding="UTF-8"\ 181 useBodyEncodingForURI="true" />\ 182 ' $NutchEZ_HOME/tomcat/conf/server.xml 183 } 184 185 set_nutch-site 
() { 186 echo "$NutchEZ_HOME/tomcat/webapps/ROOT/WEB-INF/classes/nutch-site.xml" 187 188 # 搜尋加入設定的行號位址 189 line_NO=`cat $NutchEZ_HOME'/conf/nutch-site.xml' | grep -n '<'configuration'>' | sed 's/:.*//g'` 190 191 # 加入設定檔 192 sed -i ''$line_NO'a <property>\ 193 <name>http.agent.name</name>\ 194 <value>waue</value>\ 195 <description>HTTP 'User-Agent' request header. </description>\ 196 </property>\ 197 <property>\ 198 <name>http.agent.description</name>\ 199 <value>MyTest</value>\ 200 <description>Further description</description>\ 201 </property>\ 202 <property>\ 203 <name>http.agent.url</name>\ 204 <value>'$MasterDNS'</value>\ 205 <description>A URL to advertise in the User-Agent header. </description>\ 206 </property>\ 207 <property>\ 208 <name>http.agent.email</name>\ 209 <value>'$Admin_email'</value>\ 210 <description>An email address\ 211 </description>\ 212 </property>\ 213 ' $NutchEZ_HOME/conf/nutch-site.xml 214 } 215 263 #set_nutch-site2 264 } 216 265 217 266 start_up_tomcat () {
Note: See TracChangeset for help on using the changeset viewer.