| | 1 | {{{ |
| | 2 | #!html |
| | 3 | <div style="text-align: center; color:#151B8D"><big style="font-weight: bold;"><big><big> |
| | 4 | Nutch 1.3 |
| | 5 | </big></big></big></div> <div style="text-align: center; color:#7E2217"><big style="font-weight: bold;"><big> |
| | 6 | |
| | 7 | </big></big></div> |
| | 8 | }}} |
| | 9 | [[PageOutline]] |
| | 10 | |
| | 11 | 7 June 2011 - Apache Nutch 1.3 Released |
| | 12 | |
| | 13 | [http://ftp.twaren.net/Unix/Web/apache/nutch/ Download Nutch 1.3 (Apache mirror)] |
| | 14 | |
| | 15 | * Edit conf/nutch-site.xml and set the crawler's agent name (the fetcher refuses to run when http.agent.name is empty) |
| | 16 | |
| | 17 | {{{ |
| | 18 | #!text |
| | 19 | <property> |
| | 20 | <name>http.agent.name</name> |
| | 21 | <value>waue_test</value> |
| | 22 | </property> |
| | 23 | }}} |
| | 24 | |
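| | 25 | * Edit the URL filter: conf/regex-urlfilter.txt (Nutch 1.3 no longer ships conf/crawl-urlfilter.txt, the file used by earlier releases) |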
| | 26 | |
| | 27 | {{{ |
| | 28 | #!text |
| | 29 | -^(file|ftp|mailto): |
| | 30 | |
| | 31 | # skip image and other suffixes we can't yet parse |
| | 32 | -\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|jpeg|JPEG|bmp|BMP)$ |
| | 33 | |
| | 34 | # skip URLs containing certain characters as probable queries, etc. |
| | 35 | -[*!] |
| | 36 | |
| | 37 | # skip URLs with slash-delimited segment that repeats 3+ times, to break loops |
| | 38 | #-.*(/[^/]+)/[^/]+\1/[^/]+\1/ |
| | 39 | |
| | 40 | # accept anything else |
| | 41 | +. |
| | 42 | }}} |
| | 43 | |
| | 44 | * Create the seed URL list (the urls/ directory must already exist) and run the crawl |
| | 45 | {{{ |
| | 46 | echo "http://lucene.apache.org/nutch/" >urls/url.txt |
| | 47 | bin/nutch crawl urls -dir crawl2 -depth 2 -topN 50 |
| | 48 | }}} |