{{{ #!html
Nutch 1.3
}}} [[PageOutline]] = [intro] = * 7 June 2011 - Apache Nutch 1.3 Released * References: * [http://wiki.apache.org/nutch/NutchTutorial] * [http://wiki.apache.org/nutch/RunningNutchAndSolr] = [setup] = == get == [http://ftp.twaren.net/Unix/Web/apache/nutch/ get nutch] * extract to /opt/nutch-1.3 {{{ cd /opt/nutch-1.3 ant }}} == deploy == You can place bin/nutch and nutch-1.3.job into Hadoop to integrate Nutch with it. == local == {{{ cd /opt/nutch-1.3/runtime/local }}} * bin/nutch (inject) {{{ #!text export JAVA_HOME="/usr/lib/jvm/java-6-sun" }}} * conf/nutch-site.xml (inject) {{{ #!text http.agent.name waue_test plugin.includes protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|scoring-opic|urlnormalizer-(pass|regex|basic) http.robots.agents nutch http.agent.url waue_test http.agent.email waue_test http.agent.version waue_test }}} * conf/regex-urlfilter.txt (replace) (1.2 conf/crawl-urlfilter.txt) {{{ #!text -^(file|ftp|mailto): # skip image and other suffixes we can't yet parse -\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|jpeg|JPEG|bmp|BMP)$ # skip URLs containing certain characters as probable queries, etc. -[*!] # skip URLs with slash-delimited segment that repeats 3+ times, to break loops #-.*(/[^/]+)/[^/]+\1/[^/]+\1/ # accept anything else +. }}} == [setup solr] == * [http://ftp.twaren.net/Unix/Web/apache/lucene/solr/3.3.0/ get solr] * extract to /opt/solr-3.3.0/ {{{ cd /opt/solr-3.3.0/ cp /opt/nutch-1.3/conf/schema.xml /opt/solr-3.3.0/example/solr/conf/ cd /opt/solr-3.3.0/example/ java -jar start.jar }}} = [execute] = {{{ mkdir urls ; echo "http://lucene.apache.org/nutch/" >urls/url.txt bin/nutch crawl urls -dir crawl -depth 2 -topN 50 }}} * You will get only 3 directories. {{{ #!text crawldb linkdb segments }}} * Finally, connect the Nutch crawl result to Solr: {{{ bin/nutch solrindex http://127.0.0.1:8983/solr/ crawl/crawldb crawl/linkdb crawl/segments/* }}} * Use the Solr web admin to verify: http://localhost:8983/solr/admin/