1 | <?xml version="1.0" ?> |
---|
2 | <!-- |
---|
3 | Licensed to the Apache Software Foundation (ASF) under one or more |
---|
4 | contributor license agreements. See the NOTICE file distributed with |
---|
5 | this work for additional information regarding copyright ownership. |
---|
6 | The ASF licenses this file to You under the Apache License, Version 2.0 |
---|
7 | (the "License"); you may not use this file except in compliance with |
---|
8 | the License. You may obtain a copy of the License at |
---|
9 | |
---|
10 | http://www.apache.org/licenses/LICENSE-2.0 |
---|
11 | |
---|
12 | Unless required by applicable law or agreed to in writing, software |
---|
13 | distributed under the License is distributed on an "AS IS" BASIS, |
---|
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
---|
15 | See the License for the specific language governing permissions and |
---|
16 | limitations under the License. |
---|
17 | --> |
---|
18 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?> |
---|
19 | |
---|
20 | <!-- Overridden defaults for intranet use. --> |
---|
21 | |
---|
22 | <!-- Do not modify this file directly. Instead, copy entries that you --> |
---|
23 | <!-- wish to modify from this file into nutch-site.xml and change them --> |
---|
24 | <!-- there. If nutch-site.xml does not already exist, create it. --> |
---|
25 | |
---|
26 | <configuration> |
---|
27 | |
---|
28 | <property> |
---|
29 | <name>urlfilter.regex.file</name> |
---|
30 | <value>crawl-urlfilter.txt</value> |
---|
31 | </property> |
---|
32 | |
---|
33 | <property> |
---|
34 | <name>db.ignore.internal.links</name> |
---|
35 | <value>false</value> |
---|
36 | <description>If true, when adding new links to a page, links from |
---|
37 | the same host are ignored. This is an effective way to limit the |
---|
38 | size of the link database, keeping only the highest quality |
---|
39 | links. |
---|
40 | </description> |
---|
41 | </property> |
---|
42 | |
---|
43 | <property> |
---|
44 | <name>fetcher.server.delay</name> |
---|
45 | <value>1.0</value> |
---|
46 | <description>The number of seconds the fetcher will delay between |
---|
47 | successive requests to the same server.</description> |
---|
48 | </property> |
---|
49 | |
---|
50 | <property> |
---|
51 | <name>http.max.delays</name> |
---|
52 | <value>1000</value> |
---|
53 | <description>The number of times a thread will delay when trying to |
---|
54 | fetch a page. When using the crawl tool there are likely to be very |
---|
55 | few different hosts, so we need to be willing to wait longer for |
---|
56 | each.</description> |
---|
57 | </property> |
---|
58 | |
---|
59 | </configuration> |
---|