[66] | 1 | <?xml version="1.0" ?> |
---|
| 2 | <!-- |
---|
| 3 | Licensed to the Apache Software Foundation (ASF) under one or more |
---|
| 4 | contributor license agreements. See the NOTICE file distributed with |
---|
| 5 | this work for additional information regarding copyright ownership. |
---|
| 6 | The ASF licenses this file to You under the Apache License, Version 2.0 |
---|
| 7 | (the "License"); you may not use this file except in compliance with |
---|
| 8 | the License. You may obtain a copy of the License at |
---|
| 9 | |
---|
| 10 | http://www.apache.org/licenses/LICENSE-2.0 |
---|
| 11 | |
---|
| 12 | Unless required by applicable law or agreed to in writing, software |
---|
| 13 | distributed under the License is distributed on an "AS IS" BASIS, |
---|
| 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
---|
| 15 | See the License for the specific language governing permissions and |
---|
| 16 | limitations under the License. |
---|
| 17 | --> |
---|
| 18 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?> |
---|
| 19 | |
---|
| 20 | <!-- Overridden defaults for intranet use. --> |
---|
| 21 | |
---|
| 22 | <!-- Do not modify this file directly. Instead, copy entries that you --> |
---|
| 23 | <!-- wish to modify from this file into nutch-site.xml and change them --> |
---|
| 24 | <!-- there. If nutch-site.xml does not already exist, create it. --> |
---|
| 25 | |
---|
| 26 | <configuration> |
---|
| 27 | |
---|
| 28 | <property> |
---|
| 29 | <name>urlfilter.regex.file</name> |
---|
| 30 | <value>crawl-urlfilter.txt</value> |
---|
| 31 | </property> |
---|
| 32 | |
---|
| 33 | <property> |
---|
| 34 | <name>db.ignore.internal.links</name> |
---|
| 35 | <value>false</value> |
---|
| 36 | <description>If true, when adding new links to a page, links from |
---|
| 37 | the same host are ignored. This is an effective way to limit the |
---|
| 38 | size of the link database, keeping only the highest quality |
---|
| 39 | links. |
---|
| 40 | </description> |
---|
| 41 | </property> |
---|
| 42 | |
---|
| 43 | <property> |
---|
| 44 | <name>fetcher.server.delay</name> |
---|
| 45 | <value>1.0</value> |
---|
| 46 | <description>The number of seconds the fetcher will delay between |
---|
| 47 | successive requests to the same server.</description> |
---|
| 48 | </property> |
---|
| 49 | |
---|
| 50 | <property> |
---|
| 51 | <name>http.max.delays</name> |
---|
| 52 | <value>1000</value> |
---|
| 53 | <description>The number of times a thread will delay when trying to |
---|
| 54 | fetch a page. When using the crawl tool there are likely to be very |
---|
| 55 | few different hosts, so we need to be willing to wait longer for |
---|
| 56 | each.</description> |
---|
| 57 | </property> |
---|
| 58 | |
---|
| 59 | </configuration> |
---|