source: nutchez-0.1/conf/nutch-site.xml @ 74

Last change on this file since 74 was 74, checked in by waue, 15 years ago

let it work

  • Property svn:executable set to *
File size: 2.3 KB
Line 
1<configuration>
2<property>
3  <name>http.agent.name</name>
4  <value>user</value>
5  <description>Agent name advertised in the HTTP 'User-Agent' request header.</description>
6</property>
7<property>
8  <name>http.agent.description</name>
9  <value>MyTest</value>
10  <description>Further description of the crawler; this text is used in the User-Agent header, after the agent name.</description>
11</property>
12<property>
13  <name>http.agent.url</name> 
14  <value>localhost</value> 
15  <description>A URL to advertise in the User-Agent header. </description> 
16</property>
17<property>
18  <name>http.agent.email</name>
19  <value>you@yous</value> 
20  <description>An email address to advertise in the HTTP 'From' request
21  header and the User-Agent header.</description>
22</property>
23<property>
24  <name>plugin.folders</name>
25  <value>/opt/nutch/plugins</value>
26  <description>Directories where nutch plugins are located. </description>
27</property>
28<property>
29  <name>plugin.includes</name>
30  <value>protocol-http|urlfilter-regex|parse-(text|html|ext|msexcel|mspowerpoint|msword|oo|pdf|rss|zip)|index-(more|basic|anchor)|query-(more|basic|site|url)|response-(json|xml)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
31  <description>Regular expression naming plugin directory names to include; plugins not matching this expression are excluded.</description>
32 </property>
33 <property>
34  <name>parse.plugin.file</name>
35  <value>parse-plugins.xml</value>
36  <description>The name of the file that defines the associations between
37  content-types and parsers.</description>
38 </property>
39 <property>
40   <name>db.max.outlinks.per.page</name>
41   <value>-1</value>
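42   <description>The maximum number of outlinks processed per page; a negative value (such as -1) means all outlinks are processed.</description>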
43 </property> 
44 <property>
45   <name>http.content.limit</name> 
46   <value>-1</value>
47 </property>
48<property>
49  <name>indexer.mergeFactor</name>
50  <value>500</value>
51  <description>The factor that determines the frequency of Lucene segment
52  merges. This must not be less than 2, higher values increase indexing
53  speed but lead to increased RAM usage, and increase the number of
54  open file handles (which may lead to "Too many open files" errors).
55  NOTE: the "segments" here have nothing to do with Nutch segments, they
56  are a low-level data unit used by Lucene.
57  </description>
58</property>
59
60<property>
61  <name>indexer.minMergeDocs</name>
62  <value>500</value>
63  <description>This number determines the minimum number of Lucene
64  Documents buffered in memory between Lucene segment merges. Larger
65  values increase indexing speed and increase RAM usage.
66  </description>
67</property>
68
69</configuration>
70
Note: See TracBrowser for help on using the repository browser.