Changes between Version 4 and Version 5 of waue/2009/0406


Timestamp: Apr 6, 2009, 5:56:44 PM
Author: waue

Modified page: waue/2009/0406

 * All configuration files are under $NUTCH_HOME/conf
=== 3.1 hadoop-env.sh ===
Insert the following lines anywhere in the original hadoop-env.sh file:
{{{
#!sh
export JAVA_HOME=/usr/lib/jvm/java-6-sun
export HADOOP_HOME=/opt/nutch
export HADOOP_LOG_DIR=/tmp/nutch/logs
export HADOOP_SLAVES=/opt/nutch/conf/slaves
}}}
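As a quick sanity check (not part of the original steps, and assuming the paths above), the file can be sourced in a Bourne-compatible shell to confirm the variables resolve to real locations:
{{{
$ cd /opt/nutch
$ . conf/hadoop-env.sh
$ echo $JAVA_HOME                  # should print /usr/lib/jvm/java-6-sun
$ ls $JAVA_HOME/bin/java conf/slaves   # both files should exist
}}}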
=== 3.2 hadoop-site.xml ===
{{{
#!xml
<configuration>
<property>
    <name>fs.default.name</name>
    <value>gm1.nchc.org.tw:9000</value>
    <description> The name of the default file system. Either the literal string "local" or a host:port for NDFS. </description>
</property>
<property>
    <name>mapred.job.tracker</name>
    <value>gm1.nchc.org.tw:9001</value>
    <description> The host and port that the MapReduce job tracker runs at. If "local", then jobs are run in-process as a single map and reduce task. </description>
</property>
</configuration>
}}}
=== 3.3 nutch-site.xml ===
{{{
#!xml
<configuration>
<property>
  <name>http.agent.name</name>
  <value>waue</value>
  <description>HTTP 'User-Agent' request header. </description>
</property>
<property>
  <name>http.agent.description</name>
  <value>MyTest</value>
  <description>Further description</description>
</property>
<property>
  <name>http.agent.url</name>
  <value>gm1.nchc.org.tw</value>
  <description>A URL to advertise in the User-Agent header. </description>
</property>
<property>
  <name>http.agent.email</name>
  <value>waue@nchc.org.tw</value>
  <description>An email address</description>
</property>
</configuration>
}}}
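Before moving on, it may help to confirm that both edited XML files are still well-formed; a quick check, assuming the libxml2 `xmllint` tool is installed:
{{{
$ cd /opt/nutch
$ xmllint --noout conf/hadoop-site.xml conf/nutch-site.xml   # no output means both files parse cleanly
}}}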
=== 3.4 slaves ===
No change is actually needed here, since the file already contains localhost:
{{{
#!sh
localhost
}}}
=== 3.5 crawl-urlfilter.txt ===
Change the corresponding two lines of this file to the following:
{{{
#!sh
# skip URLs containing certain characters as probable queries, etc.
-[*!@]

# accept hosts in MY.DOMAIN.NAME
+^http://([a-z0-9]*\.)*.*/
}}}
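To get a feel for what the accept rule matches, the pattern can be tried against a sample URL with `grep -E` (only an approximation of the regex engine Nutch itself uses, but close enough for a quick check):
{{{
$ echo "http://lucene.apache.org/" | grep -E '^http://([a-z0-9]*\.)*.*/'
# the URL is printed back if it passes the + rule
}}}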

== step 4 Execution ==
=== 4.1 Edit the URL list ===
{{{
$ mkdir urls
$ vim urls/urls.txt
}}}

{{{
#!sh
http://lucene.apache.org
}}}
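Equivalently, the seed file can be written in one line without an editor (this assumes the file is meant to live inside the urls directory, as in 4.1 above):
{{{
$ echo "http://lucene.apache.org" > urls/urls.txt
}}}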

=== 4.2 Start HDFS ===
{{{
$ bin/hadoop namenode -format
$ bin/start-all.sh
}}}
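To confirm the daemons actually came up, two quick checks can be run (standard tools of this Hadoop/JDK generation, not part of the original steps):
{{{
$ jps                          # should list NameNode, DataNode, JobTracker and TaskTracker
$ bin/hadoop dfsadmin -report  # should report at least one live datanode
}}}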
=== 4.3 Upload the URL list to HDFS ===
{{{
$ bin/hadoop dfs -put urls urls
}}}
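A quick listing can confirm the upload landed in the HDFS home directory:
{{{
$ bin/hadoop dfs -ls urls   # urls/urls.txt should appear in the listing
}}}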
=== 4.4 Run nutch crawl ===
{{{
$ bin/nutch crawl urls -dir crawl01 -depth 3
}}}
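When the crawl finishes, the output directory on HDFS can be inspected; for the crawl command of this Nutch generation it typically contains crawldb, linkdb and segments (plus index data):
{{{
$ bin/hadoop dfs -ls crawl01
}}}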
== step 5 Web browsing ==