source: nutchez-0.1/conf/nutch-site.xml @ 80

Last change on this file since 80 was 80, checked in by waue, 15 years ago

NutchEz ..0.1

  • Property svn:executable set to *
File size: 2.9 KB
Line 
<configuration>
  <!--
    Site-specific Nutch property overrides. Each <property> pairs a <name>
    with the <value> to use and a human-readable <description>.
  -->

  <!-- Crawler identity advertised to remote servers. -->
  <!-- NOTE(review): the http.agent.* values below look like placeholders;
       confirm and replace them with real contact details before crawling
       hosts you do not control. -->
  <property>
    <name>http.agent.name</name>
    <value>user</value>
    <description>HTTP 'User-Agent' request header.</description>
  </property>
  <property>
    <name>http.agent.description</name>
    <value>MyTest</value>
    <description>Further description of the crawler, advertised alongside
    the agent name.</description>
  </property>
  <property>
    <name>http.agent.url</name>
    <value>localhost</value>
    <description>A URL to advertise in the User-Agent header.</description>
  </property>
  <property>
    <name>http.agent.email</name>
    <value>you@yous</value>
    <description>An email address to advertise for contacting the crawler
    operator.</description>
  </property>

  <!-- Plugin discovery and selection. -->
  <property>
    <name>plugin.folders</name>
    <value>/opt/nutch/plugins</value>
    <description>Directories where nutch plugins are located.</description>
  </property>
  <property>
    <name>plugin.includes</name>
    <value>protocol-http|urlfilter-regex|parse-(text|html|ext|msexcel|mspowerpoint|msword|oo|pdf|rss|zip)|index-(more|basic|anchor)|query-(more|basic|site|url)|response-(json|xml)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
    <description>Regular expression naming plugin directory names to
    include.</description>
  </property>
  <property>
    <name>parse.plugin.file</name>
    <value>parse-plugins.xml</value>
    <description>The name of the file that defines the associations between
    content-types and parsers.</description>
  </property>

  <!-- Fetch and link limits. A negative value disables the limit. -->
  <property>
    <name>db.max.outlinks.per.page</name>
    <value>-1</value>
    <description>The maximum number of outlinks that will be processed for a
    page. If this value is nonnegative (>=0), at most this many outlinks are
    processed per page; otherwise, all outlinks are processed.
    </description>
  </property>
  <property>
    <name>http.content.limit</name>
    <value>-1</value>
    <description>The length limit for content downloaded over HTTP, in bytes.
    If this value is nonnegative (>=0), content longer than it will be
    truncated; otherwise, no truncation at all. Not to be confused with
    file.content.limit.
    </description>
  </property>
  <property>
    <name>file.content.limit</name>
    <value>1000000</value>
    <description>The length limit for downloaded content, in bytes.
    If this value is nonnegative (>=0), content longer than it will be truncated;
    otherwise, no truncation at all.
    </description>
  </property>
  <property>
    <name>db.ignore.external.links</name>
    <value>false</value>
    <description>If true, outlinks leading from a page to external hosts
    will be ignored. This is an effective way to limit the crawl to include
    only initially injected hosts, without creating complex URLFilters.
    </description>
  </property>

  <!-- Lucene indexing tuning. -->
  <property>
    <name>indexer.mergeFactor</name>
    <value>500</value>
    <description>The factor that determines the frequency of Lucene segment
    merges. This must not be less than 2, higher values increase indexing
    speed but lead to increased RAM usage, and increase the number of
    open file handles (which may lead to "Too many open files" errors).
    NOTE: the "segments" here have nothing to do with Nutch segments, they
    are a low-level data unit used by Lucene.
    </description>
  </property>
  <property>
    <name>indexer.minMergeDocs</name>
    <value>500</value>
    <description>This number determines the minimum number of Lucene
    Documents buffered in memory between Lucene segment merges. Larger
    values increase indexing speed and increase RAM usage.
    </description>
  </property>
</configuration>
85
Note: See TracBrowser for help on using the repository browser.