{{{
#!html
<div style="text-align: center; color:#151B8D"><big style="font-weight: bold;"><big><big>
Nutch 1.3
</big></big></big></div> <div style="text-align: center; color:#7E2217"><big style="font-weight: bold;"><big>

</big></big></div>
}}}
[[PageOutline]]

= [intro] =

7 June 2011 - Apache Nutch 1.3 Released

= [get] =

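[http://ftp.twaren.net/Unix/Web/apache/nutch/ get nutch]

Download the 1.3 source release, unpack it to /opt/nutch-1.3, then build it with ant (the build creates the runtime/local and runtime/deploy directories used below):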

{{{
cd /opt/nutch-1.3
ant
}}}



= [setup] =

== deploy ==

可將 runtime/deploy 下的 bin/nutch 與 nutch-1.3.job 放到 Hadoop 環境中，與 Hadoop 整合執行

== local ==

{{{
cd /opt/nutch-1.3/runtime/local
}}}
 * bin/nutch  (inject)

{{{
#!text
export JAVA_HOME="/usr/lib/jvm/java-6-sun"
}}}

 * conf/nutch-site.xml (inject)

{{{
#!text
<property>
  <name>http.agent.name</name>
  <value>waue_test</value>
</property>
}}}

 * conf/regex-urlfilter.txt  (replace; in Nutch 1.2 this file was conf/crawl-urlfilter.txt)

{{{
#!text
-^(file|ftp|mailto):

# skip image and other suffixes we can't yet parse
-\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|jpeg|JPEG|bmp|BMP)$

# skip URLs containing certain characters as probable queries, etc.
-[*!]

# skip URLs with slash-delimited segment that repeats 3+ times, to break loops
#-.*(/[^/]+)/[^/]+\1/[^/]+\1/

# accept anything else
+.
}}}

= [execute] =

{{{
mkdir -p urls
echo "http://lucene.apache.org/nutch/" >urls/url.txt
bin/nutch crawl urls -dir crawl2 -depth 2 -topN 50
}}}