<?xml version="1.0" encoding="UTF-8"?>
<!--
  Apache Nutch crawler settings, written as a Hadoop-style <configuration>:
  a flat list of <property> entries, each with a <name>, a <value> and an
  optional human-readable <description>.

  Only <name> and <value> are configuration data; <description> text and
  comments are documentation for whoever maintains this file.
-->
<configuration>

  <!-- HTTP agent identity: how the crawler introduces itself to servers. -->

  <property>
    <name>http.agent.name</name>
    <!-- NOTE(review): "user" looks like a placeholder; confirm the agent name
         this crawler should announce. -->
    <value>user</value>
    <description>HTTP 'User-Agent' request header.</description>
  </property>

  <property>
    <name>http.agent.description</name>
    <value>MyTest</value>
    <description>Further description</description>
  </property>

  <property>
    <name>http.agent.url</name>
    <!-- NOTE(review): "localhost" is not reachable by remote site operators;
         confirm whether a public URL should be advertised instead. -->
    <value>localhost</value>
    <description>A URL to advertise in the User-Agent header.</description>
  </property>

  <property>
    <name>http.agent.email</name>
    <!-- NOTE(review): "you@yous" looks like a placeholder address; confirm. -->
    <value>you@yous</value>
    <description>An email address to advertise in the HTTP 'From' request
    header and User-Agent header.
    </description>
  </property>

  <!-- Plugin discovery and selection. -->

  <property>
    <name>plugin.folders</name>
    <!-- Absolute path: tied to this particular installation layout. -->
    <value>/opt/nutchez/nutch/plugins</value>
    <description>Directories where nutch plugins are located.</description>
  </property>

  <property>
    <name>plugin.includes</name>
    <!-- A single regular expression; each top-level alternative names one
         plugin directory or a group of them, e.g. parse-(text|html|...). -->
    <value>protocol-http|urlfilter-regex|parse-(text|html|ext|msexcel|mspowerpoint|msword|oo|pdf|rss|zip)|index-(more|basic|anchor)|query-(more|basic|site|url)|response-(json|xml)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
    <description>Regular expression naming plugin directory names</description>
  </property>

  <property>
    <name>parse.plugin.file</name>
    <value>parse-plugins.xml</value>
    <description>The name of the file that defines the associations between
    content-types and parsers.</description>
  </property>

  <!-- Crawl limits. A negative value switches the corresponding limit off. -->

  <property>
    <name>db.max.outlinks.per.page</name>
    <value>-1</value>
    <description>The maximum number of outlinks that will be processed for a
    page. If this value is nonnegative (>=0), at most that many outlinks are
    processed per page; otherwise, all outlinks are processed.
    </description>
  </property>

  <property>
    <name>http.content.limit</name>
    <value>-1</value>
    <description>The length limit for downloaded content using the http
    protocol, in bytes. If this value is nonnegative (>=0), content longer
    than it will be truncated; otherwise, no truncation at all.
    </description>
  </property>

  <!-- Lucene indexing parameters. -->

  <property>
    <name>indexer.mergeFactor</name>
    <!-- See the warning in the description about RAM usage and open file
         handles before raising this further. -->
    <value>500</value>
    <description>The factor that determines the frequency of Lucene segment
    merges. This must not be less than 2, higher values increase indexing
    speed but lead to increased RAM usage, and increase the number of
    open file handles (which may lead to "Too many open files" errors).
    NOTE: the "segments" here have nothing to do with Nutch segments, they
    are a low-level data unit used by Lucene.
    </description>
  </property>

  <property>
    <name>indexer.minMergeDocs</name>
    <value>500</value>
    <description>This number determines the minimum number of Lucene
    Documents buffered in memory between Lucene segment merges. Larger
    values increase indexing speed and increase RAM usage.
    </description>
  </property>

  <property>
    <name>db.ignore.external.links</name>
    <value>false</value>
    <description>If true, outlinks leading from a page to external hosts
    will be ignored. This is an effective way to limit the crawl to include
    only initially injected hosts, without creating complex URLFilters.
    </description>
  </property>

  <property>
    <name>file.content.limit</name>
    <!-- NOTE(review): plugin.includes above enables only protocol-http.
         Presumably this limit is read by the file protocol plugin; confirm
         that it has any effect with the current plugin set. -->
    <value>1000000</value>
    <description>The length limit for downloaded content, in bytes.
    If this value is nonnegative (>=0), content longer than it will be truncated;
    otherwise, no truncation at all.
    </description>
  </property>

</configuration>