Version 2 (modified by waue, 13 years ago) (diff) |
---|
Nutch 1.3
[intro]
7 June 2011 - Apache Nutch 1.3 Released
[get]
[setup]
- bin/nutch (inject)
export JAVA_HOME="/usr/lib/jvm/java-6-sun"
- conf/nutch-site.xml (inject)
<property> <name>http.agent.name</name> <value>waue_test</value> </property>
- conf/crawl-urlfilter.txt (replace)
-^(file|ftp|mailto):
# skip image and other suffixes we can't yet parse
-\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|jpeg|JPEG|bmp|BMP)$
# skip URLs containing certain characters as probable queries, etc.
-[*!]
# skip URLs with slash-delimited segment that repeats 3+ times, to break loops
#-.*(/[^/]+)/[^/]+\1/[^/]+\1/
# accept anything else
+.
[execute]
echo "http://lucene.apache.org/nutch/" >urls/url.txt
bin/nutch crawl urls -dir crawl2 -depth 2 -topN 50