source: nutchez-0.1/tomcat/webapps/ROOT/WEB-INF/classes/crawl-tool.xml @ 98

Last change on this file since 98 was 66, checked in by waue, 16 years ago

NutchEz - an easy way to nutch

File size: 2.1 KB
Line 
1<?xml version="1.0" ?> 
2<!--
3 Licensed to the Apache Software Foundation (ASF) under one or more
4  contributor license agreements.  See the NOTICE file distributed with
5  this work for additional information regarding copyright ownership.
6  The ASF licenses this file to You under the Apache License, Version 2.0
7  (the "License"); you may not use this file except in compliance with
8  the License.  You may obtain a copy of the License at
9
10      http://www.apache.org/licenses/LICENSE-2.0
11
12  Unless required by applicable law or agreed to in writing, software
13  distributed under the License is distributed on an "AS IS" BASIS,
14  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  See the License for the specific language governing permissions and
16  limitations under the License.
17-->
18<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
19
20<!-- Overridden defaults for intranet use. -->
21
22<!-- Do not modify this file directly.  Instead, copy entries that you -->
23<!-- wish to modify from this file into nutch-site.xml and change them -->
24<!-- there.  If nutch-site.xml does not already exist, create it.      -->
25
26<configuration>
27
28<property>
29  <name>urlfilter.regex.file</name>
30  <value>crawl-urlfilter.txt</value>
31</property>
32
33<property>
34  <name>db.ignore.internal.links</name>
35  <value>false</value>
36  <description>If true, when adding new links to a page, links from
37  the same host are ignored.  This is an effective way to limit the
38  size of the link database, keeping only the highest quality
39  links.
40  </description>
41</property>
42
43<property>
44  <name>fetcher.server.delay</name>
45  <value>1.0</value>
46  <description>The number of seconds the fetcher will delay between
47   successive requests to the same server.</description>
48</property>
49
50<property>
51  <name>http.max.delays</name>
52  <value>1000</value>
53  <description>The number of times a thread will delay when trying to
54  fetch a page.  When using the crawl tool there are likely to be very
55  few different hosts, so we need to be willing to wait longer for
56  each.</description>
57</property>
58
59</configuration>
Note: See TracBrowser for help on using the repository browser.