| 1 | {{{ |
| 2 | #!html |
| 3 | <div style="text-align: center; color:#151B8D"><big style="font-weight: bold;"><big><big> |
| 4 | Nutch 1.3 |
| 5 | </big></big></big></div> <div style="text-align: center; color:#7E2217"><big style="font-weight: bold;"><big> |
| 6 | |
| 7 | </big></big></div> |
| 8 | }}} |
| 9 | [[PageOutline]] |
| 10 | |
| 11 | 7 June 2011 - Apache Nutch 1.3 Released |
| 12 | |
| 13 | [http://ftp.twaren.net/Unix/Web/apache/nutch/ Download Nutch (Apache mirror)] |
| 14 | |
| 15 | * conf/nutch-site.xml — set http.agent.name (required; the fetcher will not run while it is empty) |
| 16 | |
| 17 | {{{ |
| 18 | #!text |
| 19 | <property> |
| 20 | <name>http.agent.name</name> |
| 21 | <value>waue_test</value> |
| 22 | </property> |
| 23 | }}} |
| 24 | |
| 25 | * conf/crawl-urlfilter.txt (Nutch 1.3 no longer uses this file; put these rules in conf/regex-urlfilter.txt instead) |
| 26 | |
| 27 | {{{ |
| 28 | #!text |
| 29 | -^(file|ftp|mailto): |
| 30 | |
| 31 | # skip image and other suffixes we can't yet parse |
| 32 | -\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|jpeg|JPEG|bmp|BMP)$ |
| 33 | |
| 34 | # skip URLs containing certain characters as probable queries, etc. |
| 35 | -[*!] |
| 36 | |
| 37 | # skip URLs with slash-delimited segment that repeats 3+ times, to break loops |
| 38 | #-.*(/[^/]+)/[^/]+\1/[^/]+\1/ |
| 39 | |
| 40 | # accept anything else |
| 41 | +. |
| 42 | }}} |
| 43 | |
| 44 | * create a seed list and run a test crawl |
| 45 | {{{ |
| 46 | mkdir -p urls && echo "http://lucene.apache.org/nutch/" >urls/url.txt |
| 47 | bin/nutch crawl urls -dir crawl2 -depth 2 -topN 50 |
| 48 | }}} |