source: nutchez-0.1/conf/nutch-site.xml @ 66

Last change on this file since 66 was 66, checked in by waue, 15 years ago

NutchEz - an easy way to nutch

  • Property svn:executable set to *
File size: 2.2 KB
Line 
<configuration>
  <!--
    NutchEz site configuration (nutch-site.xml).
    Each <property> is a name/value pair; <description> is free text for
    human readers. Property names and values below are unchanged from the
    checked-in revision; only layout and comments were cleaned up.
  -->

  <!-- Crawler identity advertised to remote servers. -->
  <property>
    <name>http.agent.name</name>
    <value>user</value>
    <description>HTTP 'User-Agent' request header.</description>
  </property>
  <property>
    <name>http.agent.description</name>
    <value>MyTest</value>
    <description>Further description</description>
  </property>
  <!-- NOTE(review): "localhost" is not a URL that a remote site operator can
       follow; this looks like a placeholder. Confirm the intended value. -->
  <property>
    <name>http.agent.url</name>
    <value>localhost</value>
    <description>A URL to advertise in the User-Agent header.</description>
  </property>
  <!-- NOTE(review): "you@yous" is not a deliverable address; this looks like
       a placeholder. Confirm the intended value. -->
  <property>
    <name>http.agent.email</name>
    <value>you@yous</value>
    <description>An email address</description>
  </property>

  <!-- Plugin selection and parser mapping. -->
  <!-- NOTE(review): the pattern enables both protocol-http and
       protocol-httpclient; verify that having both active is intended. -->
  <property>
    <name>plugin.includes</name>
    <value>protocol-(http|httpclient)|urlfilter-regex|parse-(text|html|js|ext|msexcel|mspowerpoint|msword|oo|pdf|rss|swf|zip)|index-(more|basic|anchor)|query-(more|basic|site|url)|response-(json|xml)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
    <description>Regular expression naming plugin directory names</description>
  </property>
  <property>
    <name>parse.plugin.file</name>
    <value>parse-plugins.xml</value>
    <description>The name of the file that defines the associations between
    content-types and parsers.</description>
  </property>

  <!-- Crawl limits. -->
  <!-- NOTE(review): the description was left blank in the original. Stock
       Nutch documents a negative value here as "process all outlinks";
       confirm against the nutch-default.xml shipped with this release. -->
  <property>
    <name>db.max.outlinks.per.page</name>
    <value>-1</value>
    <description> </description>
  </property>
  <!-- NOTE(review): no description was given in the original. Stock Nutch
       documents a negative value here as "do not truncate downloaded
       content"; confirm against the nutch-default.xml shipped with this
       release. -->
  <property>
    <name>http.content.limit</name>
    <value>-1</value>
  </property>

  <!-- Lucene indexing tuning. -->
  <!-- NOTE(review): per the description below, a high merge factor raises
       RAM use and the number of open file handles; check the process file
       descriptor limit on the target hosts. -->
  <property>
    <name>indexer.mergeFactor</name>
    <value>500</value>
    <description>The factor that determines the frequency of Lucene segment
    merges. This must not be less than 2, higher values increase indexing
    speed but lead to increased RAM usage, and increase the number of
    open file handles (which may lead to "Too many open files" errors).
    NOTE: the "segments" here have nothing to do with Nutch segments, they
    are a low-level data unit used by Lucene.
    </description>
  </property>

  <property>
    <name>indexer.minMergeDocs</name>
    <value>500</value>
    <description>This number determines the minimum number of Lucene
    Documents buffered in memory between Lucene segment merges. Larger
    values increase indexing speed and increase RAM usage.
    </description>
  </property>

</configuration>
65
Note: See TracBrowser for help on using the repository browser.