Context Navigation

0409

Timestamp:: Apr 9, 2009, 6:29:08 PM (16 years ago)
Author:: waue
Comment:: --

Legend:

: Unmodified
: Added
: Removed
: Modified

waue/2009/0409

-                      v3
+                      v4
 #!sh
 export JAVA_HOME=/usr/lib/jvm/java-6-sun
 export HADOOP_HOME=/opt/nutch
 export HADOOP_CONF_DIR=/opt/nutch_conf
 export HADOOP_SLAVES=$HADOOP_CONF_DIR/slaves
+export HADOOP_LOG_DIR=/tmp/nutch/logs
+export HADOOP_PID_DIR=/tmp/nutch/pid
+export HADOOP_LOG_DIR=/tmp/hadoop/logs
+export HADOOP_PID_DIR=/tmp/hadoop/pid
 export NUTCH_HOME=/opt/nutch
 export NUTCH_CONF_DIR=/opt/nutch_conf
 }}}
 …
 #!sh
 <configuration>
+<property>
+  <property>
     <name>fs.default.name</name>
+    <value>gm1.nchc.org.tw:9000</value>
+    <description> The name of the default file system. Either the literal string "local" or a host:port for NDFS. </description>
+</property>
+<property>
+    <value>hdfs://node1:9000/</value>
+    <description> </description>
+  </property>
+  <property>
     <name>mapred.job.tracker</name>
+    <value>gm1.nchc.org.tw:9001</value>
+    <description> The host and port that the MapReduce job tracker runs at. If "local", then jobs are run in-process as a single map and reduce task. </description>
+</property>
+    <value>node1:9001</value>
+    <description>  </description>
+  </property>
+  <property>
+    <name>hadoop.tmp.dir</name>
+    <value>/tmp/hadoop/hadoop-${user.name}</value>
+    <description> </description>
+  </property>
 </configuration>
 }}}
 === 3.3 nutch-site.xml ===
+ * 重要的設定檔，新增了必要的內容於內，然而想要了解更多參數資訊，請見nutch-default.xml
 {{{
 #!sh
 …
 <property>
   <name>http.agent.name</name>
   <value>waue</value>
+  <value>nutch</value>
   <description>HTTP 'User-Agent' request header. </description>
 </property>
 <property>
   <name>http.agent.description</name>
   <value>MyTest</value>
+  <value>nutch-crawl</value>
   <description>Further description</description>
 </property>
 <property>
   <name>http.agent.url</name>
   <value>gm1.nchc.org.tw</value>
+  <value>node1</value>
   <description>A URL to advertise in the User-Agent header. </description>
 </property>
 <property>
   <name>http.agent.email</name>
   <value>waue@nchc.org.tw</value>
+  <value>user@nchc.org.tw</value>
   <description>An email address
   </description>
 </property>
+<property>
+  <name>plugin.folders</name>
+  <value>/opt/nutch/plugins</value>
+  <description>Directories where nutch plugins are located. </description>
+</property>
+<property>
+  <name>plugin.includes</name>
+  <value>protocol-(http|httpclient)|urlfilter-regex|parse-(text|html|js|ext|msexcel|mspowerpoint|msword|oo|pdf|rss|swf|zip)|index-(more|basic|anchor)|query-(more|basic|site|url)|response-(json|xml)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
+  <description> Regular expression naming plugin directory names</description>
+ </property>
+ <property>
+  <name>parse.plugin.file</name>
+  <value>parse-plugins.xml</value>
+  <description>The name of the file that defines the associations between
+  content-types and parsers.</description>
+ </property>
+ <property>
+   <name>db.max.outlinks.per.page</name>
+   <value>-1</value>
+   <description> </description>
+ </property>
+ <property>
+   <name>http.content.limit</name>
+   <value>-1</value>
+ </property>
+<property>
+<property>
+  <name>indexer.mergeFactor</name>
+  <value>500</value>
+  <description>The factor that determines the frequency of Lucene segment
+  merges. </description>
+</property>
+<property>
+  <name>indexer.minMergeDocs</name>
+  <value>500</value>
+  <description>This number determines the minimum number of Lucene. </description>
+</property>
 </configuration>
 }}}
 === 3.4 slaves ===
+其實不用改，因為原本就是localhost
+{{{
+#!sh
+localhost
+ * 這個檔不用設定，因為依照hadoop的叢集環境，下面列出我們環境所設定的
+{{{
+#!sh
+node1
+node2
 }}}
 === 3.5 crawl-urlfilter.txt ===
+將此檔的兩行改為下面內容
+{{{
+#!sh
+ * 重新編輯爬檔規則，此檔重要在於若設定不好，則爬出來的結果幾乎是空的，也就是說最後你的搜尋引擎都找不到資料啦！
+{{{
+#!sh
+# skip  ftp:, & mailto: urls
+-^(ftp|mailto):
+# skip image and other suffixes we can't yet parse
+-\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|sit|eps|wmf|mpg|xls|gz|rpm|tgz|mov|MOV|exe|jpeg|JPEG|bmp|BMP)$
 # skip URLs containing certain characters as probable queries, etc.
 -[*!@]
+# accept hosts in MY.DOMAIN.NAME
++^http://([a-z0-9]*\.)*.*/
+}}}
+== step 4 執行 ==
+# accept anything else
++.*
+}}}
+=== 3.6 regex-urlfilter.txt ===
+ * 雖然官方網站鮮少介紹到此檔，但是crawl-urlfilter.txt用來設定爬intranet的規則，而regex-urlfilter.txt則是用來設定爬internet的規則
+{{{
+$ cd /opt/nutch_conf
+$ cp regex-urlfilter.txt regex-urlfilter.txt-bek
+$ cp crawl-urlfilter.txt regex-urlfilter.txt
+}}}
+== step 4 執行nutch ==
+ * 在此假設你已經把hadoop 啟動並且正在運作了。因此nutch是利用這個已經在運作的平台來執行
+ * 如果你的hadoop還沒啟動，則請在master節點(此篇以node1當作master)下 bin/start-all.sh指令；如果你的環境很clean，則請在master節點下
+   * 到/opt/nutch 或 /opt/hadoop皆可
+{{{
+$ cd /opt/nutch
+$ bin/hadoop namenode -format
+$ bin/start-all.sh
+}}}
 === 4.1 編輯url清單 ===
 {{{
 …
 {{{
 #!sh
+http://lucene.apache.org
+}}}
+=== 4.2 開啟HDFS ===
+{{{
+$ bin/hadoop namenode -format
+$ bin/start-all.sh
+}}}
+=== 4.3 上傳清單到HDFS ===
+http://www.nchc.org.tw
+}}}
+=== 4.2 上傳清單到HDFS ===
 {{{
 $ bin/hadoop dfs -put urls urls
 }}}
+=== 4.4 執行nutch crawl ===
+{{{
+$ bin/nutch crawl urls -dir crawl01 -depth 3
+}}}
+== step 5 web瀏覽 ==
+=== 4.3 執行nutch crawl ===
+ * 用下面的指令就可以命令nutch開始工作了，之後map reduce會瘋狂工作
+{{{
+$ bin/nutch crawl urls -dir search -threads 2 -depth 3 -topN 100000
+}}}
+   * 執行上個指令會把執行過程秀在stdout上。若想要以後慢慢看這些訊息，可以用io導向的方式傾倒於日誌檔
+{{{
+$ bin/nutch crawl urls -dir search -threads 2 -depth 3 -topN 100000 >& nutch.log
+}}}
+ * 在nutch運作的同時，可以在node1節點用瀏覽器，透過 [http://localhost:50030 job管理頁面],[http://localhost:50070 hdfs管理頁面],[http://localhost:50060 程序運作頁面] 來監看程序。
+== step 5 瀏覽搜尋結果 ==
+ * nutch 在 step 4 的工作是把你寫在urls.txt檔內的網址，用map reduce的程序來進行資料分析，但是分析完之後，要透過tomcat來觀看結果。以下就是安裝與設定你的客製化搜尋引擎的步驟。
 === 5.1 安裝tomcat ===
  * 下載
+ * 下載tomcat
 {{{
 $ cd /opt/
 …
 }}}
+=== 5.2 將crawl結果匯入tomcat ===
+=== 5.1 tomcat server設定 ===
+ * 修改 /opt/tomcat/conf/server.xml 以修正中文亂碼問題
+{{{
+#!sh
+ <Connector port="8080" protocol="HTTP/1.1"
+               connectionTimeout="20000"
+               redirectPort="8443" URIEncoding="UTF-8"
+               useBodyEncodingForURI="true" />
+}}}
+=== 5.3 下載crawl結果 ===
+ * 先把放在hdfs上，nutch的運算結果下載到local端
+{{{
+$ cd /opt/nutch
+$ bin/hadoop dfs -get search /opt/search
+}}}
+=== 5.4 設定nutch的搜尋引擎頁面到tomcat ===
+ * 用nutch的搜尋引擎頁面取代tomcat的webapps/ROOT
 {{{
 $ cd /opt/nutch
 …
 $ cd /opt/nutch
 $ mv /opt/nutch/web /opt/tomcat/webapps/ROOT
+}}}
+=== 5.5  設定搜尋引擎內容的來源路徑 ===
+ * 5.4的步驟雖然設定好搜尋引擎的頁面，然而其只能當作是介面而已，因此這個步驟把要搜尋的內容與搜尋介面做個連結
+{{{
 $ vim /opt/tomcat/webapps/ROOT/WEB-INF/classes/nutch-site.xml
 }}}
 …
 }}}
+並且修改 /opt/tomcat/conf/server.xml 以修正中文問題
+{{{
+#!sh
+    <Connector port="8080" protocol="HTTP/1.1"
+               connectionTimeout="20000"
+               redirectPort="8443" URIEncoding="UTF-8"/>
+}}}
+=== 5.3 瀏覽crawl結果 ===
+=== 5.6 啟動tomcat ===
 {{{
 $ /opt/tomcat/bin/startup.sh
 }}}
+[http://gm1.nchc.org.tw:8080]
+== step 6 享受結果 ==
+Enjoy ! [http://localhost:8080]