<?xml version="1.0" encoding="UTF-8"?>
<!--
  Crawler configuration overrides in the Hadoop-style
  <configuration>/<property> format (name, value, optional description).
  The property names and the plugin path indicate an Apache Nutch install;
  presumably this is the site override file (nutch-site.xml). TODO confirm.
-->
<configuration>

  <!-- Crawler identity advertised to remote servers. -->
  <!-- NOTE(review): "user", "localhost" and "you@yous" look like placeholder
       values; confirm real contact details before crawling public sites. -->
  <property>
    <name>http.agent.name</name>
    <value>user</value>
    <description>HTTP 'User-Agent' request header.</description>
  </property>
  <property>
    <name>http.agent.description</name>
    <value>MyTest</value>
    <description>Further description of the crawler; this text is included
    in the User-Agent header.</description>
  </property>
  <property>
    <name>http.agent.url</name>
    <value>localhost</value>
    <description>A URL to advertise in the User-Agent header.</description>
  </property>
  <property>
    <name>http.agent.email</name>
    <value>you@yous</value>
    <description>An email address to advertise in the HTTP 'From' request
    header and the User-Agent header.
    </description>
  </property>

  <!-- Plugin discovery and selection. -->
  <property>
    <name>plugin.folders</name>
    <value>/opt/nutchez/nutch/plugins</value>
    <description>Directories where nutch plugins are located.</description>
  </property>
  <property>
    <name>plugin.includes</name>
    <value>protocol-http|urlfilter-regex|parse-(text|html|ext|msexcel|mspowerpoint|msword|oo|pdf|rss|zip)|index-(more|basic|anchor)|query-(more|basic|site|url)|response-(json|xml)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
    <description>Regular expression naming plugin directory names to
    include. Any plugin not matching this expression is excluded.</description>
  </property>
  <property>
    <name>parse.plugin.file</name>
    <value>parse-plugins.xml</value>
    <description>The name of the file that defines the associations between
    content-types and parsers.</description>
  </property>

  <!-- Crawl limits. A negative value disables the corresponding limit. -->
  <property>
    <name>db.max.outlinks.per.page</name>
    <value>-1</value>
    <description>The maximum number of outlinks that will be processed for a
    page. If this value is nonnegative (>=0), at most that many outlinks are
    processed per page; otherwise, all outlinks are processed.
    </description>
  </property>
  <property>
    <name>http.content.limit</name>
    <value>-1</value>
    <description>The length limit for content downloaded over HTTP, in bytes.
    If this value is nonnegative (>=0), content longer than it will be
    truncated; otherwise, no truncation at all.
    </description>
  </property>

  <!-- Lucene indexing tuning. -->
  <!-- NOTE(review): 500 is far above the minimum of 2; per the description
       below this trades RAM and open file handles for speed. Confirm the
       host's open-file limit can accommodate it. -->
  <property>
    <name>indexer.mergeFactor</name>
    <value>500</value>
    <description>The factor that determines the frequency of Lucene segment
    merges. This must not be less than 2, higher values increase indexing
    speed but lead to increased RAM usage, and increase the number of
    open file handles (which may lead to "Too many open files" errors).
    NOTE: the "segments" here have nothing to do with Nutch segments, they
    are a low-level data unit used by Lucene.
    </description>
  </property>

  <property>
    <name>indexer.minMergeDocs</name>
    <value>500</value>
    <description>This number determines the minimum number of Lucene
    Documents buffered in memory between Lucene segment merges. Larger
    values increase indexing speed and increase RAM usage.
    </description>
  </property>

  <!-- Crawl scope. -->
  <property>
    <name>db.ignore.external.links</name>
    <value>false</value>
    <description>If true, outlinks leading from a page to external hosts
    will be ignored. This is an effective way to limit the crawl to include
    only initially injected hosts, without creating complex URLFilters.
    </description>
  </property>
  <property>
    <name>file.content.limit</name>
    <value>1000000</value>
    <description>The length limit for downloaded content, in bytes.
    If this value is nonnegative (>=0), content longer than it will be truncated;
    otherwise, no truncation at all.
    </description>
  </property>

</configuration>