Changeset 131 for nutchez-0.2


Ignore:
Timestamp:
May 27, 2010, 5:21:07 PM (14 years ago)
Author:
shunfa
Message:

modify install, install_func.sh

Location:
nutchez-0.2/src/test
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • nutchez-0.2/src/test/install

    r130 r131  
    2424    # make_ssh_key
    2525
    26     Install_Nutch
    27     Install_Tomcat
    28    
     26    Install_Nutch   
    2927    # make_client_install
    30    
    3128    format_HDFS
    3229    start_up_NutchEZ
  • nutchez-0.2/src/test/install_func.sh

    r130 r131  
    3838confirm_install_information () {
    3939  read -p "Please confirm your install infomation: 1.Yes 2.No  " confirm
     40}
     41
     42set_Nutch_conf () {
     43  set_hadoop-env
     44  set_haoop-site
     45  set_nutch-site
     46  set_crawl-urlfilter
     47}
     48
     49# set $NutchEZ_HOME/conf/hadoop-env.sh
     50set_hadoop-env () {
     51  echo "set $NutchEZ_HOME/conf/hadoop-env.sh"
     52  cd $NutchEZ_HOME/conf/
     53  cat >> hadoop-env.sh << EOF
     54export JAVA_HOME=/usr/lib/jvm/java-6-sun
     55export HADOOP_HOME=$NutchEZ_HOME
     56export HADOOP_LOG_DIR=/tmp/NutchEZ/logs
     57export HADOOP_SLAVES=$NutchEZ_HOME/conf/slaves
     58export HADOOP_CONF_DIR=$NutchEZ_HOME/conf
     59export HADOOP_PID_DIR=/tmp/hadoop/pid
     60export NUTCH_HOME=$NutchEZ_HOME
     61export NUTCH_CONF_DIR=$NutchEZ_HOME/conf
     62EOF
     63}
     64
     65# set $NutchEZ_HOME/conf/hadoop-site.xml
     66set_haoop-site () {
     67  echo "set $NutchEZ_HOME/conf/hadoop-site.xml"
     68  cd $NutchEZ_HOME/conf/
     69  cat > hadoop-site.xml << EOF
     70<configuration>
     71<property>
     72    <name>fs.default.name</name>
     73    <value>$MasterDNS:9000</value>
     74    <description> The name of the default file system. Either the literal string "local" or a host:port for NDFS. </description>
     75</property>
     76<property>
     77    <name>mapred.job.tracker</name>
     78    <value>$MasterDNS:9001</value>
     79    <description> The host and port that the MapReduce job tracker runs at. If "local", then jobs are run in-process as a single map and reduce task. </description>
     80</property>
     81</configuration>
     82EOF
     83}
     84
     85set_nutch-site () {
     86  echo "set $NutchEZ_HOME/conf/nutch-site.xml"
     87  cd $NutchEZ_HOME/conf/
     88  cat > nutch-site.xml << EOF
     89<configuration>
     90<property>
     91  <name>http.agent.name</name>
     92  <value>nutchuser</value>
     93  <description>HTTP 'User-Agent' request header. </description>
     94</property>
     95<property>
     96  <name>http.agent.description</name>
     97  <value>MyTest</value>
     98  <description>Further description</description>
     99</property>
     100<property>
     101  <name>http.agent.url</name>
     102  <value>$MasterDNS</value>
     103  <description>A URL to advertise in the User-Agent header. </description>
     104</property>
     105<property>
     106  <name>$MasterDNS</name>
     107  <value>$Admin_email</value>
     108  <description>An email address
     109  </description>
     110</property>
     111</configuration>
     112EOF
     113}
     114
     115
     116set_crawl-urlfilter () {
     117  echo "set $NutchEZ_HOME/conf/set_crawl-urlfilter.txt"
     118  Line_NO=`cat $NutchEZ_HOME'/conf/crawl-urlfilter.txt' | grep -n 'skip file:, ftp:, & mailto: urls' | sed 's/:.*//g'`
     119  sed -i ''$((Line_NO+1))'d' $NutchEZ_HOME/conf/crawl-urlfilter.txt
     120  sed -i ''$Line_NO'a -^(ftp|mailto):' $NutchEZ_HOME/conf/crawl-urlfilter.txt
     121
     122
     123  Line_NO=`cat $NutchEZ_HOME'/conf/crawl-urlfilter.txt' | grep -n 'skip image and other suffixes we can' | sed 's/:.*//g'`
     124  sed -i ''$((Line_NO+1))'d' $NutchEZ_HOME/conf/crawl-urlfilter.txt
     125  sed -i ''$Line_NO'a -\\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|sit|eps|wmf|mpg|xls|gz|rpm|tgz|mov|MOV|exe|jpeg|JPEG|bmp|BMP)$' $NutchEZ_HOME/conf/crawl-urlfilter.txt
     126
     127
     128  Line_NO=`cat $NutchEZ_HOME'/conf/crawl-urlfilter.txt' | grep -n 'skip URLs containing certain characters as probable queries, etc.' | sed 's/:.*//g'`
     129  sed -i ''$((Line_NO+1))'d' $NutchEZ_HOME/conf/crawl-urlfilter.txt
     130  sed -i ''$Line_NO'a -[*!@]' $NutchEZ_HOME/conf/crawl-urlfilter.txt
     131
     132
     133  Line_NO=`cat $NutchEZ_HOME'/conf/crawl-urlfilter.txt' | grep -n 'skip everything else' | sed 's/:.*//g'`
     134  sed -i ''$((Line_NO+1))'d' $NutchEZ_HOME/conf/crawl-urlfilter.txt
     135  sed -i ''$Line_NO'a +.*' $NutchEZ_HOME/conf/crawl-urlfilter.txt
     136  sed -i ''$Line_NO'a # accecpt anything else' $NutchEZ_HOME/conf/crawl-urlfilter.txt
     137}
     138
     139format_HDFS () {
     140  echo "format HDFS..."
     141  $NutchEZ_HOME/bin/hadoop namenode -format
     142}
     143
     144start_up_NutchEZ (){
     145  echo "start up NutchEZ..."
     146  $NutchEZ_HOME/bin/start-all.sh
     147}
     148
     149set_server () {
     150  echo "$NutchEZ_HOME/tomcat/conf/server.xml"
     151  Line_NO=`cat $NutchEZ_HOME'/tomcat/conf/server.xml' | grep -n '<!-- A "Connector" using the shared thread pool-->' | sed 's/:.*//g'`
     152
     153  sed -i ''$((Line_NO+1))','$((Line_NO+6))'d' $NutchEZ_HOME/tomcat/conf/server.xml
     154  sed -i ''$Line_NO'a    <Connector port="8080" protocol="HTTP/1.1"\
     155               connectionTimeout="20000"\
     156               redirectPort="8443" URIEncoding="UTF-8"\
     157               useBodyEncodingForURI="true" />\
     158' $NutchEZ_HOME/tomcat/conf/server.xml
     159}
     160
     161
     162set_nutch-site2 () {
     163  echo "$NutchEZ_HOME/tomcat/webapps/ROOT/WEB-INF/classes/nutch-site.xml"
     164 
     165  # Find the line number after which the settings will be inserted
     166  line_NO=`cat $NutchEZ_HOME'/tomcat/webapps/ROOT/WEB-INF/classes/nutch-site.xml' | grep -n '<'configuration'>' | sed 's/:.*//g'`
     167 
     168  # Insert the configuration properties
     169  sed -i ''$line_NO'a  <property>\
     170  <name>http.agent.name</name>\
     171  <value>nutch</value>\
     172  <description>HTTP 'User-Agent' request header. </description> \
     173</property>\
     174<property>\
     175  <name>http.agent.description</name>\
     176  <value>MyTest</value>\
     177  <description>Further description</description> \
     178</property>\
     179<property>\
     180  <name>http.agent.url</name> \
     181  <value>localhost</value> \
     182  <description>A URL to advertise in the User-Agent header. </description> \
     183</property>\
     184<property>\
     185  <name>http.agent.email</name>\
     186  <value>'$Admin_email'</value> \
     187  <description>An email address \
     188  </description> \
     189</property>\
     190<property>\
     191  <name>plugin.folders</name>\
     192  <value>'$NutchEZ_HOME'/plugins</value>\
     193  <description>Directories where nutch plugins are located. </description>\
     194</property>\
     195<property>\
     196  <name>plugin.includes</name>\
     197  <value>protocol-(http|httpclient)|urlfilter-regex|parse-(text|html|js|ext|msexcel|mspowerpoint|msword|oo|pdf|rss|swf|zip)|index-(more|basic|anchor)|query-(more|basic|site|url)|response-(json|xml)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value>\
     198  <description> Regular expression naming plugin directory names</description>\
     199 </property>\
     200 <property>\
     201  <name>parse.plugin.file</name>\
     202  <value>parse-plugins.xml</value>\
     203  <description>The name of the file that defines the associations between\
     204  content-types and parsers.</description>\
     205 </property>\
     206 <property>\
     207   <name>db.max.outlinks.per.page</name>\
     208   <value>-1</value>\
     209   <description> </description>\
     210 </property> \
     211 <property>\
     212   <name>http.content.limit</name> \
     213   <value>-1</value>\
     214 </property>\
     215<property>\
     216  <name>indexer.mergeFactor</name>\
     217  <value>500</value>\
     218  <description>The factor that determines the frequency of Lucene segment\
     219  merges. This must not be less than 2, higher values increase indexing\
     220  speed but lead to increased RAM usage, and increase the number of\
     221  open file handles (which may lead to "Too many open files" errors).\
     222  NOTE: the "segments" here have nothing to do with Nutch segments, they\
     223  are a low-level data unit used by Lucene.\
     224  </description>\
     225</property>\
     226
     227<property>\
     228  <name>indexer.minMergeDocs</name>\
     229  <value>500</value>\
     230  <description>This number determines the minimum number of Lucene\
     231  Documents buffered in memory between Lucene segment merges. Larger\
     232  values increase indexing speed and increase RAM usage.\
     233  </description>\
     234</property>\
     235
     236' $NutchEZ_HOME/tomcat/webapps/ROOT/WEB-INF/classes/nutch-site.xml
    40237}
    41238
     
    49246}
    50247
    51 set_Nutch_conf () {
    52   set_hadoop-env
    53   set_haoop-site
    54   set_nutch-site
    55   set_crawl-urlfilter
    56 }
    57 
    58 # set $NutchEZ_HOME/conf/hadoop-env.sh
    59 set_hadoop-env () {
    60   echo "set $NutchEZ_HOME/conf/hadoop-env.sh"
    61   cd $NutchEZ_HOME/conf/
    62   cat >> hadoop-env.sh << EOF
    63 export JAVA_HOME=/usr/lib/jvm/java-6-sun
    64 export HADOOP_HOME=/opt/NutchEZ
    65 export HADOOP_LOG_DIR=/tmp/NutchEZ/logs
    66 export HADOOP_SLAVES=/opt/NutchEZ/conf/slaves
    67 EOF
    68 }
    69 
    70 # set $NutchEZ_HOME/conf/hadoop-site.xml
    71 set_haoop-site () {
    72   echo "set $NutchEZ_HOME/conf/hadoop-site.xml"
    73   cd $NutchEZ_HOME/conf/
    74   cat > hadoop-site.xml << EOF
    75 <configuration>
    76 <property>
    77     <name>fs.default.name</name>
    78     <value>$MasterDNS:9000</value>
    79     <description> The name of the default file system. Either the literal string "local" or a host:port for NDFS. </description>
    80 </property>
    81 <property>
    82     <name>mapred.job.tracker</name>
    83     <value>$MasterDNS:9001</value>
    84     <description> The host and port that the MapReduce job tracker runs at. If "local", then jobs are run in-process as a single map and reduce task. </description>
    85 </property>
    86 </configuration>
    87 EOF
    88 }
    89 
    90 set_nutch-site () {
    91   echo "set $NutchEZ_HOME/conf/nutch-site.xml"
    92   cd $NutchEZ_HOME/conf/
    93   cat > nutch-site.xml << EOF
    94 <configuration>
    95 <property>
    96   <name>http.agent.name</name>
    97   <value>nutchuser</value>
    98   <description>HTTP 'User-Agent' request header. </description>
    99 </property>
    100 <property>
    101   <name>http.agent.description</name>
    102   <value>MyTest</value>
    103   <description>Further description</description>
    104 </property>
    105 <property>
    106   <name>http.agent.url</name>
    107   <value>$MasterDNS</value>
    108   <description>A URL to advertise in the User-Agent header. </description>
    109 </property>
    110 <property>
    111   <name>$MasterDNS</name>
    112   <value>$Admin_email</value>
    113   <description>An email address
    114   </description>
    115 </property>
    116 </configuration>
    117 EOF
    118 }
    119 
    120 
    121 set_crawl-urlfilter () {
    122   echo "set $NutchEZ_HOME/conf/set_crawl-urlfilter.txt"
    123   Line_NO=`cat $NutchEZ_HOME'/conf/crawl-urlfilter.txt' | grep -n 'skip file:, ftp:, & mailto: urls' | sed 's/:.*//g'`
    124   sed -i ''$((Line_NO+1))'d' $NutchEZ_HOME/conf/crawl-urlfilter.txt
    125   sed -i ''$Line_NO'a -^(ftp|mailto):' $NutchEZ_HOME/conf/crawl-urlfilter.txt
    126 
    127 
    128   Line_NO=`cat $NutchEZ_HOME'/conf/crawl-urlfilter.txt' | grep -n 'skip image and other suffixes we can' | sed 's/:.*//g'`
    129   sed -i ''$((Line_NO+1))'d' $NutchEZ_HOME/conf/crawl-urlfilter.txt
    130   sed -i ''$Line_NO'a -\\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|sit|eps|wmf|mpg|xls|gz|rpm|tgz|mov|MOV|exe|jpeg|JPEG|bmp|BMP)$' $NutchEZ_HOME/conf/crawl-urlfilter.txt
    131 
    132 
    133   Line_NO=`cat $NutchEZ_HOME'/conf/crawl-urlfilter.txt' | grep -n 'skip URLs containing certain characters as probable queries, etc.' | sed 's/:.*//g'`
    134   sed -i ''$((Line_NO+1))'d' $NutchEZ_HOME/conf/crawl-urlfilter.txt
    135   sed -i ''$Line_NO'a -[*!@]' $NutchEZ_HOME/conf/crawl-urlfilter.txt
    136 
    137 
    138   Line_NO=`cat $NutchEZ_HOME'/conf/crawl-urlfilter.txt' | grep -n 'skip everything else' | sed 's/:.*//g'`
    139   sed -i ''$((Line_NO+1))'d' $NutchEZ_HOME/conf/crawl-urlfilter.txt
    140   sed -i ''$Line_NO'a +.*' $NutchEZ_HOME/conf/crawl-urlfilter.txt
    141   sed -i ''$Line_NO'a # accecpt anything else' $NutchEZ_HOME/conf/crawl-urlfilter.txt
    142 }
    143 
    144 format_HDFS () {
    145   echo "format HDFS..."
    146   $NutchEZ_HOME/bin/hadoop namenode -format
    147 }
    148 
    149 start_up_NutchEZ (){
    150   echo "start up NutchEZ..."
    151   $NutchEZ_HOME/bin/start-all.sh
    152 }
    153 
    154248# install tomcat
    155249Install_Tomcat () {
    156250  cd /opt/
    157 #  tar zxf apache-tomcat-6.0.18.tar.gz
     251#  tar zxvf apache-tomcat-6.0.18.tar.gz
    158252  tar zxf apache-tomcat-6.0.18.tar.gz
    159253  mv apache-tomcat-6.0.18 $NutchEZ_HOME
    160254  cd $NutchEZ_HOME
    161255  mv  apache-tomcat-6.0.18 tomcat
     256  mkdir web
     257  # mkdir $NutchEZ_HOME/search
    162258  chown -R nutchuser:nutchuser $NutchEZ_HOME
    163   mkdir $NutchEZ_HOME/web
    164   jar -xvf $NutchEZ_HOME/nutch-1.0.war $NutchEZ_HOME/web
     259  jar -xvf nutch-1.0.war web
    165260  mv $NutchEZ_HOME/tomcat/webapps/ROOT $NutchEZ_HOME/tomcat/webapps/ROOT-ori
    166261  mv $NutchEZ_HOME/web $NutchEZ_HOME/tomcat/webapps/ROOT
    167   mkdir $NutchEZ_HOME/search
    168262  set_server
    169   set_nutch-site
    170 }
    171 
    172 
    173 set_server () {
    174   echo "$NutchEZ_HOME/tomcat/conf/server.xml"
    175   Line_NO=`cat $NutchEZ_HOME'/tomcat/conf/server.xml' | grep -n '<!-- A "Connector" using the shared thread pool-->' | sed 's/:.*//g'`
    176 
    177   sed -i ''$((Line_NO+1))','$((Line_NO+6))'d' $NutchEZ_HOME/tomcat/conf/server.xml
    178   sed -i ''$Line_NO'a    <Connector port="8080" protocol="HTTP/1.1"\
    179                connectionTimeout="20000"\
    180                redirectPort="8443" URIEncoding="UTF-8"\
    181                useBodyEncodingForURI="true" />\
    182 ' $NutchEZ_HOME/tomcat/conf/server.xml
    183 }
    184 
    185 set_nutch-site () {
    186   echo "$NutchEZ_HOME/tomcat/webapps/ROOT/WEB-INF/classes/nutch-site.xml"
    187  
    188   # Find the line number after which the settings will be inserted
    189   line_NO=`cat $NutchEZ_HOME'/conf/nutch-site.xml' | grep -n '<'configuration'>' | sed 's/:.*//g'`
    190  
    191   # Insert the configuration properties
    192   sed -i ''$line_NO'a <property>\
    193   <name>http.agent.name</name>\
    194   <value>waue</value>\
    195   <description>HTTP 'User-Agent' request header. </description>\
    196 </property>\
    197 <property>\
    198   <name>http.agent.description</name>\
    199   <value>MyTest</value>\
    200   <description>Further description</description>\
    201 </property>\
    202 <property>\
    203   <name>http.agent.url</name>\
    204   <value>'$MasterDNS'</value>\
    205   <description>A URL to advertise in the User-Agent header. </description>\
    206 </property>\
    207 <property>\
    208   <name>http.agent.email</name>\
    209   <value>'$Admin_email'</value>\
    210   <description>An email address\
    211   </description>\
    212 </property>\
    213 ' $NutchEZ_HOME/conf/nutch-site.xml
    214 }
    215 
     263  #set_nutch-site2
     264}
    216265
    217266start_up_tomcat () {
Note: See TracChangeset for help on using the changeset viewer.