1 | <?xml version="1.0"?> |
---|
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?> |
---|
3 | <!-- |
---|
4 | Licensed to the Apache Software Foundation (ASF) under one or more |
---|
5 | contributor license agreements. See the NOTICE file distributed with |
---|
6 | this work for additional information regarding copyright ownership. |
---|
7 | The ASF licenses this file to You under the Apache License, Version 2.0 |
---|
8 | (the "License"); you may not use this file except in compliance with |
---|
9 | the License. You may obtain a copy of the License at |
---|
10 | |
---|
11 | http://www.apache.org/licenses/LICENSE-2.0 |
---|
12 | |
---|
13 | Unless required by applicable law or agreed to in writing, software |
---|
14 | distributed under the License is distributed on an "AS IS" BASIS, |
---|
15 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
---|
16 | See the License for the specific language governing permissions and |
---|
17 | limitations under the License. |
---|
18 | --> |
---|
19 | <!-- Do not modify this file directly. Instead, copy entries that you --> |
---|
20 | <!-- wish to modify from this file into nutch-site.xml and change them --> |
---|
21 | <!-- there. If nutch-site.xml does not already exist, create it. --> |
---|
22 | |
---|
23 | <configuration> |
---|
24 | |
---|
25 | <!-- file properties --> |
---|
26 | |
---|
27 | <property> |
---|
28 | <name>file.content.limit</name> |
---|
29 | <value>65536</value> |
---|
30 | <description>The length limit for downloaded content, in bytes. |
---|
31 | If this value is nonnegative (>=0), content longer than it will be truncated; |
---|
32 | otherwise, no truncation at all. |
---|
33 | </description> |
---|
34 | </property> |
---|
35 | |
---|
36 | <property> |
---|
37 | <name>file.content.ignored</name> |
---|
38 | <value>true</value> |
---|
39 | <description>If true, no file content will be saved during fetch. |
---|
40 | And it is probably what we want to set most of the time, since file:// URLs |
---|
41 | are meant to be local and we can always use them directly at parsing |
---|
42 | and indexing stages. Otherwise file contents will be saved. |
---|
43 | !! NOT IMPLEMENTED YET !! |
---|
44 | </description> |
---|
45 | </property> |
---|
46 | |
---|
47 | <!-- HTTP properties --> |
---|
48 | |
---|
49 | <property> |
---|
50 | <name>http.agent.name</name> |
---|
51 | <value></value> |
---|
52 | <description>HTTP 'User-Agent' request header. MUST NOT be empty - |
---|
53 | please set this to a single word uniquely related to your organization. |
---|
54 | |
---|
55 | NOTE: You should also check other related properties: |
---|
56 | |
---|
57 | http.robots.agents |
---|
58 | http.agent.description |
---|
59 | http.agent.url |
---|
60 | http.agent.email |
---|
61 | http.agent.version |
---|
62 | |
---|
63 | and set their values appropriately. |
---|
64 | |
---|
65 | </description> |
---|
66 | </property> |
---|
67 | |
---|
68 | <property> |
---|
69 | <name>http.robots.agents</name> |
---|
70 | <value>*</value> |
---|
71 | <description>The agent strings we'll look for in robots.txt files, |
---|
72 | comma-separated, in decreasing order of precedence. You should |
---|
73 | put the value of http.agent.name as the first agent name, and keep the |
---|
74 | default * at the end of the list. E.g.: BlurflDev,Blurfl,* |
---|
75 | </description> |
---|
76 | </property> |
---|
77 | |
---|
78 | <property> |
---|
79 | <name>http.robots.403.allow</name> |
---|
80 | <value>true</value> |
---|
81 | <description>Some servers return HTTP status 403 (Forbidden) if |
---|
82 | /robots.txt doesn't exist. This should probably mean that we are |
---|
83 | allowed to crawl the site nonetheless. If this is set to false, |
---|
84 | then such sites will be treated as forbidden.</description> |
---|
85 | </property> |
---|
86 | |
---|
87 | <property> |
---|
88 | <name>http.agent.description</name> |
---|
89 | <value></value> |
---|
90 | <description>Further description of our bot - this text is used in |
---|
91 | the User-Agent header. It appears in parentheses after the agent name. |
---|
92 | </description> |
---|
93 | </property> |
---|
94 | |
---|
95 | <property> |
---|
96 | <name>http.agent.url</name> |
---|
97 | <value></value> |
---|
98 | <description>A URL to advertise in the User-Agent header. This will |
---|
99 | appear in parentheses after the agent name. Custom dictates that this |
---|
100 | should be a URL of a page explaining the purpose and behavior of this |
---|
101 | crawler. |
---|
102 | </description> |
---|
103 | </property> |
---|
104 | |
---|
105 | <property> |
---|
106 | <name>http.agent.email</name> |
---|
107 | <value></value> |
---|
108 | <description>An email address to advertise in the HTTP 'From' request |
---|
109 | header and User-Agent header. A good practice is to mangle this |
---|
110 | address (e.g. 'info at example dot com') to avoid spamming. |
---|
111 | </description> |
---|
112 | </property> |
---|
113 | |
---|
114 | <property> |
---|
115 | <name>http.agent.version</name> |
---|
116 | <value>Nutch-1.0</value> |
---|
117 | <description>A version string to advertise in the User-Agent |
---|
118 | header.</description> |
---|
119 | </property> |
---|
120 | |
---|
121 | <property> |
---|
122 | <name>http.agent.host</name> |
---|
123 | <value></value> |
---|
124 | <description>Name or IP address of the host on which the Nutch crawler |
---|
125 | would be running. Currently this is used by 'protocol-httpclient' |
---|
126 | plugin. |
---|
127 | </description> |
---|
128 | </property> |
---|
129 | |
---|
130 | <property> |
---|
131 | <name>http.timeout</name> |
---|
132 | <value>10000</value> |
---|
133 | <description>The default network timeout, in milliseconds.</description> |
---|
134 | </property> |
---|
135 | |
---|
136 | <property> |
---|
137 | <name>http.max.delays</name> |
---|
138 | <value>100</value> |
---|
139 | <description>The number of times a thread will delay when trying to |
---|
140 | fetch a page. Each time it finds that a host is busy, it will wait |
---|
141 | fetcher.server.delay. After http.max.delays attempts, it will give |
---|
142 | up on the page for now.</description> |
---|
143 | </property> |
---|
144 | |
---|
145 | <property> |
---|
146 | <name>http.content.limit</name> |
---|
147 | <value>65536</value> |
---|
148 | <description>The length limit for downloaded content, in bytes. |
---|
149 | If this value is nonnegative (>=0), content longer than it will be truncated; |
---|
150 | otherwise, no truncation at all. |
---|
151 | </description> |
---|
152 | </property> |
---|
153 | |
---|
154 | <property> |
---|
155 | <name>http.proxy.host</name> |
---|
156 | <value></value> |
---|
157 | <description>The proxy hostname. If empty, no proxy is used.</description> |
---|
158 | </property> |
---|
159 | |
---|
160 | <property> |
---|
161 | <name>http.proxy.port</name> |
---|
162 | <value></value> |
---|
163 | <description>The proxy port.</description> |
---|
164 | </property> |
---|
165 | |
---|
166 | <property> |
---|
167 | <name>http.proxy.username</name> |
---|
168 | <value></value> |
---|
169 | <description>Username for proxy. This will be used by |
---|
170 | 'protocol-httpclient', if the proxy server requests basic, digest |
---|
171 | and/or NTLM authentication. To use this, 'protocol-httpclient' must |
---|
172 | be present in the value of 'plugin.includes' property. |
---|
173 | NOTE: For NTLM authentication, do not prefix the username with the |
---|
174 | domain, i.e. 'susam' is correct whereas 'DOMAIN\susam' is incorrect. |
---|
175 | </description> |
---|
176 | </property> |
---|
177 | |
---|
178 | <property> |
---|
179 | <name>http.proxy.password</name> |
---|
180 | <value></value> |
---|
181 | <description>Password for proxy. This will be used by |
---|
182 | 'protocol-httpclient', if the proxy server requests basic, digest |
---|
183 | and/or NTLM authentication. To use this, 'protocol-httpclient' must |
---|
184 | be present in the value of 'plugin.includes' property. |
---|
185 | </description> |
---|
186 | </property> |
---|
187 | |
---|
188 | <property> |
---|
189 | <name>http.proxy.realm</name> |
---|
190 | <value></value> |
---|
191 | <description>Authentication realm for proxy. Do not define a value |
---|
192 | if realm is not required or authentication should take place for any |
---|
193 | realm. NTLM does not use the notion of realms. Specify the domain name |
---|
194 | of NTLM authentication as the value for this property. To use this, |
---|
195 | 'protocol-httpclient' must be present in the value of |
---|
196 | 'plugin.includes' property. |
---|
197 | </description> |
---|
198 | </property> |
---|
199 | |
---|
200 | <property> |
---|
201 | <name>http.auth.file</name> |
---|
202 | <value>httpclient-auth.xml</value> |
---|
203 | <description>Authentication configuration file for |
---|
204 | 'protocol-httpclient' plugin. |
---|
205 | </description> |
---|
206 | </property> |
---|
207 | |
---|
208 | <property> |
---|
209 | <name>http.verbose</name> |
---|
210 | <value>false</value> |
---|
211 | <description>If true, HTTP will log more verbosely.</description> |
---|
212 | </property> |
---|
213 | |
---|
214 | <property> |
---|
215 | <name>http.redirect.max</name> |
---|
216 | <value>0</value> |
---|
217 | <description>The maximum number of redirects the fetcher will follow when |
---|
218 | trying to fetch a page. If set to negative or 0, fetcher won't immediately |
---|
219 | follow redirected URLs, instead it will record them for later fetching. |
---|
220 | </description> |
---|
221 | </property> |
---|
222 | |
---|
223 | <property> |
---|
224 | <name>http.useHttp11</name> |
---|
225 | <value>false</value> |
---|
226 | <description>NOTE: at the moment this works only for protocol-httpclient. |
---|
227 | If true, use HTTP 1.1; if false, use HTTP 1.0. |
---|
228 | </description> |
---|
229 | </property> |
---|
230 | |
---|
231 | <!-- FTP properties --> |
---|
232 | |
---|
233 | <property> |
---|
234 | <name>ftp.username</name> |
---|
235 | <value>anonymous</value> |
---|
236 | <description>ftp login username.</description> |
---|
237 | </property> |
---|
238 | |
---|
239 | <property> |
---|
240 | <name>ftp.password</name> |
---|
241 | <value>anonymous@example.com</value> |
---|
242 | <description>ftp login password.</description> |
---|
243 | </property> |
---|
244 | |
---|
245 | <property> |
---|
246 | <name>ftp.content.limit</name> |
---|
247 | <value>65536</value> |
---|
248 | <description>The length limit for downloaded content, in bytes. |
---|
249 | If this value is nonnegative (>=0), content longer than it will be truncated; |
---|
250 | otherwise, no truncation at all. |
---|
251 | Caution: classical ftp RFCs never define partial transfer and, in fact, |
---|
252 | some ftp servers out there do not handle client side forced close-down very |
---|
253 | well. Our implementation tries its best to handle such situations smoothly. |
---|
254 | </description> |
---|
255 | </property> |
---|
256 | |
---|
257 | <property> |
---|
258 | <name>ftp.timeout</name> |
---|
259 | <value>60000</value> |
---|
260 | <description>Default timeout for ftp client socket, in millisec. |
---|
261 | Please also see ftp.keep.connection below.</description> |
---|
262 | </property> |
---|
263 | |
---|
264 | <property> |
---|
265 | <name>ftp.server.timeout</name> |
---|
266 | <value>100000</value> |
---|
267 | <description>An estimation of ftp server idle time, in millisec. |
---|
268 | Typically it is 120000 millisec for many ftp servers out there. |
---|
269 | Better be conservative here. Together with ftp.timeout, it is used to |
---|
270 | decide if we need to delete (annihilate) current ftp.client instance and |
---|
271 | force to start another ftp.client instance anew. This is necessary because |
---|
272 | a fetcher thread may not be able to obtain next request from queue in time |
---|
273 | (due to idleness) before our ftp client times out or remote server |
---|
274 | disconnects. Used only when ftp.keep.connection is true (please see below). |
---|
275 | </description> |
---|
276 | </property> |
---|
277 | |
---|
278 | <property> |
---|
279 | <name>ftp.keep.connection</name> |
---|
280 | <value>false</value> |
---|
281 | <description>Whether to keep ftp connection. Useful if crawling same host |
---|
282 | again and again. When set to true, it avoids connection, login and dir list |
---|
283 | parser setup for subsequent urls. If it is set to true, however, you must |
---|
284 | make sure (roughly): |
---|
285 | (1) ftp.timeout is less than ftp.server.timeout |
---|
286 | (2) ftp.timeout is larger than (fetcher.threads.fetch * fetcher.server.delay) |
---|
287 | Otherwise there will be too many "delete client because idled too long" |
---|
288 | messages in thread logs.</description> |
---|
289 | </property> |
---|
290 | |
---|
291 | <property> |
---|
292 | <name>ftp.follow.talk</name> |
---|
293 | <value>false</value> |
---|
294 | <description>Whether to log dialogue between our client and remote |
---|
295 | server. Useful for debugging.</description> |
---|
296 | </property> |
---|
297 | |
---|
298 | <!-- web db properties --> |
---|
299 | |
---|
300 | <property> |
---|
301 | <name>db.default.fetch.interval</name> |
---|
302 | <value>30</value> |
---|
303 | <description>(DEPRECATED) The default number of days between re-fetches of a page. |
---|
304 | </description> |
---|
305 | </property> |
---|
306 | |
---|
307 | <property> |
---|
308 | <name>db.fetch.interval.default</name> |
---|
309 | <value>2592000</value> |
---|
310 | <description>The default number of seconds between re-fetches of a page (30 days). |
---|
311 | </description> |
---|
312 | </property> |
---|
313 | |
---|
314 | <property> |
---|
315 | <name>db.fetch.interval.max</name> |
---|
316 | <value>7776000</value> |
---|
317 | <description>The maximum number of seconds between re-fetches of a page |
---|
318 | (90 days). After this period every page in the db will be re-tried, no |
---|
319 | matter what its status is. |
---|
320 | </description> |
---|
321 | </property> |
---|
322 | |
---|
323 | <property> |
---|
324 | <name>db.fetch.schedule.class</name> |
---|
325 | <value>org.apache.nutch.crawl.DefaultFetchSchedule</value> |
---|
326 | <description>The implementation of fetch schedule. DefaultFetchSchedule simply |
---|
327 | adds the original fetchInterval to the last fetch time, regardless of |
---|
328 | page changes.</description> |
---|
329 | </property> |
---|
330 | |
---|
331 | <property> |
---|
332 | <name>db.fetch.schedule.adaptive.inc_rate</name> |
---|
333 | <value>0.4</value> |
---|
334 | <description>If a page is unmodified, its fetchInterval will be |
---|
335 | increased by this rate. This value should not |
---|
336 | exceed 0.5, otherwise the algorithm becomes unstable.</description> |
---|
337 | </property> |
---|
338 | |
---|
339 | <property> |
---|
340 | <name>db.fetch.schedule.adaptive.dec_rate</name> |
---|
341 | <value>0.2</value> |
---|
342 | <description>If a page is modified, its fetchInterval will be |
---|
343 | decreased by this rate. This value should not |
---|
344 | exceed 0.5, otherwise the algorithm becomes unstable.</description> |
---|
345 | </property> |
---|
346 | |
---|
347 | <property> |
---|
348 | <name>db.fetch.schedule.adaptive.min_interval</name> |
---|
349 | <value>60.0</value> |
---|
350 | <description>Minimum fetchInterval, in seconds.</description> |
---|
351 | </property> |
---|
352 | |
---|
353 | <property> |
---|
354 | <name>db.fetch.schedule.adaptive.max_interval</name> |
---|
355 | <value>31536000.0</value> |
---|
356 | <description>Maximum fetchInterval, in seconds (365 days). |
---|
357 | NOTE: this is limited by db.fetch.interval.max. Pages with |
---|
358 | fetchInterval larger than db.fetch.interval.max |
---|
359 | will be fetched anyway.</description> |
---|
360 | </property> |
---|
361 | |
---|
362 | <property> |
---|
363 | <name>db.fetch.schedule.adaptive.sync_delta</name> |
---|
364 | <value>true</value> |
---|
365 | <description>If true, try to synchronize with the time of page change |
---|
366 | by shifting the next fetchTime by a fraction (sync_delta_rate) of the difference |
---|
367 | between the last modification time, and the last fetch time.</description> |
---|
368 | </property> |
---|
369 | |
---|
370 | <property> |
---|
371 | <name>db.fetch.schedule.adaptive.sync_delta_rate</name> |
---|
372 | <value>0.3</value> |
---|
373 | <description>See sync_delta for description. This value should not |
---|
374 | exceed 0.5, otherwise the algorithm becomes unstable.</description> |
---|
375 | </property> |
---|
376 | |
---|
377 | <property> |
---|
378 | <name>db.update.additions.allowed</name> |
---|
379 | <value>true</value> |
---|
380 | <description>If true, updatedb will add newly discovered URLs, if false |
---|
381 | only already existing URLs in the CrawlDb will be updated and no new |
---|
382 | URLs will be added. |
---|
383 | </description> |
---|
384 | </property> |
---|
385 | |
---|
386 | <property> |
---|
387 | <name>db.ignore.internal.links</name> |
---|
388 | <value>true</value> |
---|
389 | <description>If true, when adding new links to a page, links from |
---|
390 | the same host are ignored. This is an effective way to limit the |
---|
391 | size of the link database, keeping only the highest quality |
---|
392 | links. |
---|
393 | </description> |
---|
394 | </property> |
---|
395 | |
---|
396 | <property> |
---|
397 | <name>db.ignore.external.links</name> |
---|
398 | <value>false</value> |
---|
399 | <description>If true, outlinks leading from a page to external hosts |
---|
400 | will be ignored. This is an effective way to limit the crawl to include |
---|
401 | only initially injected hosts, without creating complex URLFilters. |
---|
402 | </description> |
---|
403 | </property> |
---|
404 | |
---|
405 | <property> |
---|
406 | <name>db.score.injected</name> |
---|
407 | <value>1.0</value> |
---|
408 | <description>The score of new pages added by the injector. |
---|
409 | </description> |
---|
410 | </property> |
---|
411 | |
---|
412 | <property> |
---|
413 | <name>db.score.link.external</name> |
---|
414 | <value>1.0</value> |
---|
415 | <description>The score factor for new pages added due to a link from |
---|
416 | another host relative to the referencing page's score. Scoring plugins |
---|
417 | may use this value to affect initial scores of external links. |
---|
418 | </description> |
---|
419 | </property> |
---|
420 | |
---|
421 | <property> |
---|
422 | <name>db.score.link.internal</name> |
---|
423 | <value>1.0</value> |
---|
424 | <description>The score factor for pages added due to a link from the |
---|
425 | same host, relative to the referencing page's score. Scoring plugins |
---|
426 | may use this value to affect initial scores of internal links. |
---|
427 | </description> |
---|
428 | </property> |
---|
429 | |
---|
430 | <property> |
---|
431 | <name>db.score.count.filtered</name> |
---|
432 | <value>false</value> |
---|
433 | <description>The score value passed to newly discovered pages is |
---|
434 | calculated as a fraction of the original page score divided by the |
---|
435 | number of outlinks. If this option is false, only the outlinks that passed |
---|
436 | URLFilters will count, if it's true then all outlinks will count. |
---|
437 | </description> |
---|
438 | </property> |
---|
439 | |
---|
440 | <property> |
---|
441 | <name>db.max.inlinks</name> |
---|
442 | <value>10000</value> |
---|
443 | <description>Maximum number of Inlinks per URL to be kept in LinkDb. |
---|
444 | If "invertlinks" finds more inlinks than this number, only the first |
---|
445 | N inlinks will be stored, and the rest will be discarded. |
---|
446 | </description> |
---|
447 | </property> |
---|
448 | |
---|
449 | <property> |
---|
450 | <name>db.max.outlinks.per.page</name> |
---|
451 | <value>100</value> |
---|
452 | <description>The maximum number of outlinks that we'll process for a page. |
---|
453 | If this value is nonnegative (>=0), at most db.max.outlinks.per.page outlinks |
---|
454 | will be processed for a page; otherwise, all outlinks will be processed. |
---|
455 | </description> |
---|
456 | </property> |
---|
457 | |
---|
458 | <property> |
---|
459 | <name>db.max.anchor.length</name> |
---|
460 | <value>100</value> |
---|
461 | <description>The maximum number of characters permitted in an anchor. |
---|
462 | </description> |
---|
463 | </property> |
---|
464 | |
---|
465 | <property> |
---|
466 | <name>db.fetch.retry.max</name> |
---|
467 | <value>3</value> |
---|
468 | <description>The maximum number of times a url that has encountered |
---|
469 | recoverable errors is generated for fetch.</description> |
---|
470 | </property> |
---|
471 | |
---|
472 | <property> |
---|
473 | <name>db.signature.class</name> |
---|
474 | <value>org.apache.nutch.crawl.MD5Signature</value> |
---|
475 | <description>The default implementation of a page signature. Signatures |
---|
476 | created with this implementation will be used for duplicate detection |
---|
477 | and removal.</description> |
---|
478 | </property> |
---|
479 | |
---|
480 | <property> |
---|
481 | <name>db.signature.text_profile.min_token_len</name> |
---|
482 | <value>2</value> |
---|
483 | <description>Minimum token length to be included in the signature. |
---|
484 | </description> |
---|
485 | </property> |
---|
486 | |
---|
487 | <property> |
---|
488 | <name>db.signature.text_profile.quant_rate</name> |
---|
489 | <value>0.01</value> |
---|
490 | <description>Profile frequencies will be rounded down to a multiple of |
---|
491 | QUANT = (int)(QUANT_RATE * maxFreq), where maxFreq is a maximum token |
---|
492 | frequency. If maxFreq > 1 then QUANT will be at least 2, which means that |
---|
493 | for longer texts tokens with frequency 1 will always be discarded. |
---|
494 | </description> |
---|
495 | </property> |
---|
496 | |
---|
497 | <!-- generate properties --> |
---|
498 | |
---|
499 | <property> |
---|
500 | <name>generate.max.per.host</name> |
---|
501 | <value>-1</value> |
---|
502 | <description>The maximum number of urls per host in a single |
---|
503 | fetchlist. -1 if unlimited.</description> |
---|
504 | </property> |
---|
505 | |
---|
506 | <property> |
---|
507 | <name>generate.max.per.host.by.ip</name> |
---|
508 | <value>false</value> |
---|
509 | <description>If false, same host names are counted. If true, |
---|
510 | hosts' IP addresses are resolved and the same IP-s are counted. |
---|
511 | |
---|
512 | -+-+-+- WARNING !!! -+-+-+- |
---|
513 | When set to true, Generator will create a lot of DNS lookup |
---|
514 | requests, rapidly. This may cause a DOS attack on |
---|
515 | remote DNS servers, not to mention increased external traffic |
---|
516 | and latency. For these reasons when using this option it is |
---|
517 | required that a local caching DNS be used.</description> |
---|
518 | </property> |
---|
519 | |
---|
520 | <property> |
---|
521 | <name>generate.update.crawldb</name> |
---|
522 | <value>false</value> |
---|
523 | <description>For highly-concurrent environments, where several |
---|
524 | generate/fetch/update cycles may overlap, setting this to true ensures |
---|
525 | that generate will create different fetchlists even without intervening |
---|
526 | updatedb-s, at the cost of running an additional job to update CrawlDB. |
---|
527 | If false, running generate twice without intervening |
---|
528 | updatedb will generate identical fetchlists.</description> |
---|
529 | </property> |
---|
530 | |
---|
531 | <!-- fetcher properties --> |
---|
532 | |
---|
533 | <property> |
---|
534 | <name>fetcher.server.delay</name> |
---|
535 | <value>5.0</value> |
---|
536 | <description>The number of seconds the fetcher will delay between |
---|
537 | successive requests to the same server.</description> |
---|
538 | </property> |
---|
539 | |
---|
540 | <property> |
---|
541 | <name>fetcher.server.min.delay</name> |
---|
542 | <value>0.0</value> |
---|
543 | <description>The minimum number of seconds the fetcher will delay between |
---|
544 | successive requests to the same server. This value is applicable ONLY |
---|
545 | if fetcher.threads.per.host is greater than 1 (i.e. the host blocking |
---|
546 | is turned off).</description> |
---|
547 | </property> |
---|
548 | |
---|
549 | <property> |
---|
550 | <name>fetcher.max.crawl.delay</name> |
---|
551 | <value>30</value> |
---|
552 | <description> |
---|
553 | If the Crawl-Delay in robots.txt is set to greater than this value (in |
---|
554 | seconds) then the fetcher will skip this page, generating an error report. |
---|
555 | If set to -1 the fetcher will never skip such pages and will wait the |
---|
556 | amount of time retrieved from robots.txt Crawl-Delay, however long that |
---|
557 | might be. |
---|
558 | </description> |
---|
559 | </property> |
---|
560 | |
---|
561 | <property> |
---|
562 | <name>fetcher.threads.fetch</name> |
---|
563 | <value>10</value> |
---|
564 | <description>The number of FetcherThreads the fetcher should use. |
---|
565 | This also determines the maximum number of requests that are |
---|
566 | made at once (each FetcherThread handles one connection).</description> |
---|
567 | </property> |
---|
568 | |
---|
569 | <property> |
---|
570 | <name>fetcher.threads.per.host</name> |
---|
571 | <value>1</value> |
---|
572 | <description>This number is the maximum number of threads that |
---|
573 | should be allowed to access a host at one time.</description> |
---|
574 | </property> |
---|
575 | |
---|
576 | <property> |
---|
577 | <name>fetcher.threads.per.host.by.ip</name> |
---|
578 | <value>true</value> |
---|
579 | <description>If true, then fetcher will count threads by IP address, |
---|
580 | to which the URL's host name resolves. If false, only host name will be |
---|
581 | used. NOTE: this should be set to the same value as |
---|
582 | "generate.max.per.host.by.ip" - default settings are different only for |
---|
583 | reasons of backward-compatibility.</description> |
---|
584 | </property> |
---|
585 | |
---|
586 | <property> |
---|
587 | <name>fetcher.verbose</name> |
---|
588 | <value>false</value> |
---|
589 | <description>If true, fetcher will log more verbosely.</description> |
---|
590 | </property> |
---|
591 | |
---|
592 | <property> |
---|
593 | <name>fetcher.parse</name> |
---|
594 | <value>true</value> |
---|
595 | <description>If true, fetcher will parse content.</description> |
---|
596 | </property> |
---|
597 | |
---|
598 | <property> |
---|
599 | <name>fetcher.store.content</name> |
---|
600 | <value>true</value> |
---|
601 | <description>If true, fetcher will store content.</description> |
---|
602 | </property> |
---|
603 | |
---|
604 | <!-- indexer properties --> |
---|
605 | |
---|
606 | <property> |
---|
607 | <name>indexer.score.power</name> |
---|
608 | <value>0.5</value> |
---|
609 | <description>Determines the power of link analysis scores. Each |
---|
610 | page's boost is set to <i>score<sup>scorePower</sup></i> where |
---|
611 | <i>score</i> is its link analysis score and <i>scorePower</i> is the |
---|
612 | value of this parameter. This is compiled into indexes, so, when |
---|
613 | this is changed, pages must be re-indexed for it to take |
---|
614 | effect.</description> |
---|
615 | </property> |
---|
616 | |
---|
617 | <property> |
---|
618 | <name>indexer.max.title.length</name> |
---|
619 | <value>100</value> |
---|
620 | <description>The maximum number of characters of a title that are indexed. |
---|
621 | </description> |
---|
622 | </property> |
---|
623 | |
---|
624 | <property> |
---|
625 | <name>indexer.max.tokens</name> |
---|
626 | <value>10000</value> |
---|
627 | <description> |
---|
628 | The maximum number of tokens that will be indexed for a single field |
---|
629 | in a document. This limits the amount of memory required for |
---|
630 | indexing, so that collections with very large files will not crash |
---|
631 | the indexing process by running out of memory. |
---|
632 | |
---|
633 | Note that this effectively truncates large documents, excluding |
---|
634 | from the index tokens that occur further in the document. If you |
---|
635 | know your source documents are large, be sure to set this value |
---|
636 | high enough to accommodate the expected size. If you set it to |
---|
637 | -1, then the only limit is your memory, but you should anticipate |
---|
638 | an OutOfMemoryError. |
---|
639 | </description> |
---|
640 | </property> |
---|
641 | |
---|
642 | <property> |
---|
643 | <name>indexer.mergeFactor</name> |
---|
644 | <value>50</value> |
---|
645 | <description>The factor that determines the frequency of Lucene segment |
---|
646 | merges. This must not be less than 2, higher values increase indexing |
---|
647 | speed but lead to increased RAM usage, and increase the number of |
---|
648 | open file handles (which may lead to "Too many open files" errors). |
---|
649 | NOTE: the "segments" here have nothing to do with Nutch segments, they |
---|
650 | are a low-level data unit used by Lucene. |
---|
651 | </description> |
---|
652 | </property> |
---|
653 | |
---|
654 | <property> |
---|
655 | <name>indexer.minMergeDocs</name> |
---|
656 | <value>50</value> |
---|
657 | <description>This number determines the minimum number of Lucene |
---|
658 | Documents buffered in memory between Lucene segment merges. Larger |
---|
659 | values increase indexing speed and increase RAM usage. |
---|
660 | </description> |
---|
661 | </property> |
---|
662 | |
---|
663 | <property> |
---|
664 | <name>indexer.maxMergeDocs</name> |
---|
665 | <value>2147483647</value> |
---|
666 | <description>This number determines the maximum number of Lucene |
---|
667 | Documents to be merged into a new Lucene segment. Larger values |
---|
668 | increase batch indexing speed and reduce the number of Lucene segments, |
---|
669 | which reduces the number of open file handles; however, this also |
---|
670 | decreases incremental indexing performance. |
---|
671 | </description> |
---|
672 | </property> |
---|
673 | |
---|
674 | <property> |
---|
675 | <name>indexer.termIndexInterval</name> |
---|
676 | <value>128</value> |
---|
677 | <description>Determines the fraction of terms which Lucene keeps in |
---|
678 | RAM when searching, to facilitate random-access. Smaller values use |
---|
679 | more memory but make searches somewhat faster. Larger values use |
---|
680 | less memory but make searches somewhat slower. |
---|
681 | </description> |
---|
682 | </property> |
---|
683 | |
---|
684 | <!-- indexingfilter plugin properties --> |
---|
685 | |
---|
686 | <property> |
---|
687 | <name>indexingfilter.order</name> |
---|
688 | <value></value> |
---|
689 | <description>The order by which index filters are applied. |
---|
690 | If empty, all available index filters (as dictated by properties |
---|
691 | plugin.includes and plugin.excludes above) are loaded and applied in system |
---|
692 | defined order. If not empty, only named filters are loaded and applied |
---|
693 | in given order. For example, if this property has value: |
---|
694 | org.apache.nutch.indexer.basic.BasicIndexingFilter org.apache.nutch.indexer.more.MoreIndexingFilter |
---|
695 | then BasicIndexingFilter is applied first, and MoreIndexingFilter second. |
---|
696 | |
---|
697 | Filter ordering might have impact on result if one filter depends on output of |
---|
698 | another filter. |
---|
699 | </description> |
---|
700 | </property> |
---|
701 | |
---|
702 | |
---|
703 | <!-- analysis properties --> |
---|
704 | |
---|
705 | <property> |
---|
706 | <name>analysis.common.terms.file</name> |
---|
707 | <value>common-terms.utf8</value> |
---|
708 | <description>The name of a file containing a list of common terms |
---|
709 | that should be indexed in n-grams.</description> |
---|
710 | </property> |
---|
711 | |
---|
712 | <!-- searcher properties --> |
---|
713 | |
---|
714 | <property> |
---|
715 | <name>searcher.dir</name> |
---|
716 | <value>crawl</value> |
---|
717 | <description> |
---|
718 | Path to root of crawl. This directory is searched (in |
---|
719 | order) for either the file search-servers.txt, containing a list of |
---|
720 | distributed search servers, or the directory "index" containing |
---|
721 | merged indexes, or the directory "segments" containing segment |
---|
722 | indexes. |
---|
723 | </description> |
---|
724 | </property> |
---|
725 | |
---|
726 | <property> |
---|
727 | <name>searcher.filter.cache.size</name> |
---|
728 | <value>16</value> |
---|
729 | <description> |
---|
730 | Maximum number of filters to cache. Filters can accelerate certain |
---|
731 | field-based queries, like language, document format, etc. Each |
---|
732 | filter requires one bit of RAM per page. So, with a 10 million page |
---|
733 | index, a cache size of 16 consumes two bytes per page, or 20MB. |
---|
734 | </description> |
---|
735 | </property> |
---|
736 | |
---|
737 | <property> |
---|
738 | <name>searcher.filter.cache.threshold</name> |
---|
739 | <value>0.05</value> |
---|
740 | <description> |
---|
741 | Filters are cached when their term is matched by more than this |
---|
742 | fraction of pages. For example, with a threshold of 0.05, and 10 |
---|
743 | million pages, the term must match more than 1/20, or 500,000 pages. |
---|
744 | So, if out of 10 million pages, 50% of pages are in English, and 2% |
---|
745 | are in Finnish, then, with a threshold of 0.05, searches for |
---|
746 | "lang:en" will use a cached filter, while searches for "lang:fi" |
---|
747 | will score all 200,000 Finnish documents. |
---|
748 | </description> |
---|
749 | </property> |
---|
750 | |
---|
751 | <property> |
---|
752 | <name>searcher.hostgrouping.rawhits.factor</name> |
---|
753 | <value>2.0</value> |
---|
754 | <description> |
---|
755 | A factor that is used to determine the number of raw hits |
---|
756 | initially fetched, before host grouping is done. |
---|
757 | </description> |
---|
758 | </property> |
---|
759 | |
---|
760 | <property> |
---|
761 | <name>searcher.summary.context</name> |
---|
762 | <value>5</value> |
---|
763 | <description> |
---|
764 | The number of context terms to display preceding and following |
---|
765 | matching terms in a hit summary. |
---|
766 | </description> |
---|
767 | </property> |
---|
768 | |
---|
769 | <property> |
---|
770 | <name>searcher.summary.length</name> |
---|
771 | <value>20</value> |
---|
772 | <description> |
---|
773 | The total number of terms to display in a hit summary. |
---|
774 | </description> |
---|
775 | </property> |
---|
776 | |
---|
777 | <property> |
---|
778 | <name>searcher.max.hits</name> |
---|
779 | <value>-1</value> |
---|
780 | <description>If positive, search stops after this many hits are |
---|
781 | found. Setting this to small, positive values (e.g., 1000) can make |
---|
782 | searches much faster. With a sorted index, the quality of the hits |
---|
783 | suffers little.</description> |
---|
784 | </property> |
---|
785 | |
---|
786 | <property> |
---|
787 | <name>searcher.max.time.tick_count</name> |
---|
788 | <value>-1</value> |
---|
789 | <description>If a positive value is defined here, limit search time for |
---|
790 | every request to this number of elapsed ticks (see the tick_length |
---|
791 | property below). The total maximum time for any search request will be |
---|
792 | then limited to tick_count * tick_length milliseconds. When search time |
---|
793 | is exceeded, partial results will be returned, and the total number of |
---|
794 | hits will be estimated. |
---|
795 | </description> |
---|
796 | </property> |
---|
797 | |
---|
798 | <property> |
---|
799 | <name>searcher.max.time.tick_length</name> |
---|
800 | <value>200</value> |
---|
801 | <description>The number of milliseconds between ticks. Larger values |
---|
802 | reduce the timer granularity (precision). Smaller values bring more |
---|
803 | overhead. |
---|
804 | </description> |
---|
805 | </property> |
---|
806 | |
---|
807 | <property> |
---|
808 | <name>searcher.num.handlers</name> |
---|
809 | <value>10</value> |
---|
810 | <description>The number of handlers for the distributed search server. |
---|
811 | </description> |
---|
812 | </property> |
---|
813 | |
---|
814 | <property> |
---|
815 | <name>searcher.max.hits.per.page</name> |
---|
816 | <value>1000</value> |
---|
817 | <description> The maximum number of hits to show per page. -1 if |
---|
818 | unlimited. If the number of hits requested by the user (via |
---|
819 | hitsPerPage parameter in the query string) is more than the value |
---|
820 | specified in this property, then this value is assumed as the number |
---|
821 | of hits per page. |
---|
822 | </description> |
---|
823 | </property> |
---|
824 | |
---|
825 | <!-- URL normalizer properties --> |
---|
826 | |
---|
827 | <property> |
---|
828 | <name>urlnormalizer.order</name> |
---|
829 | <value>org.apache.nutch.net.urlnormalizer.basic.BasicURLNormalizer org.apache.nutch.net.urlnormalizer.regex.RegexURLNormalizer</value> |
---|
830 | <description>Order in which normalizers will run. If any of these isn't |
---|
831 | activated it will be silently skipped. If other normalizers not on the |
---|
832 | list are activated, they will run in random order after the ones |
---|
833 | specified here are run. |
---|
834 | </description> |
---|
835 | </property> |
---|
836 | |
---|
837 | <property> |
---|
838 | <name>urlnormalizer.regex.file</name> |
---|
839 | <value>regex-normalize.xml</value> |
---|
840 | <description>Name of the config file used by the RegexURLNormalizer class. |
---|
841 | </description> |
---|
842 | </property> |
---|
843 | |
---|
844 | <property> |
---|
845 | <name>urlnormalizer.loop.count</name> |
---|
846 | <value>1</value> |
---|
847 | <description>Optionally loop through normalizers several times, to make |
---|
848 | sure that all transformations have been performed. |
---|
849 | </description> |
---|
850 | </property> |
---|
851 | |
---|
852 | <!-- mime properties --> |
---|
853 | |
---|
854 | <property> |
---|
855 | <name>mime.types.file</name> |
---|
856 | <value>tika-mimetypes.xml</value> |
---|
857 | <description>Name of file in CLASSPATH containing filename extension and |
---|
858 | magic sequence to mime types mapping information</description> |
---|
859 | </property> |
---|
860 | |
---|
861 | <property> |
---|
862 | <name>mime.type.magic</name> |
---|
863 | <value>true</value> |
---|
864 | <description>Defines if the mime content type detector uses magic resolution. |
---|
865 | </description> |
---|
866 | </property> |
---|
867 | |
---|
868 | <!-- plugin properties --> |
---|
869 | |
---|
870 | <property> |
---|
871 | <name>plugin.folders</name> |
---|
872 | <value>plugins</value> |
---|
873 | <description>Directories where nutch plugins are located. Each |
---|
874 | element may be a relative or absolute path. If absolute, it is used |
---|
875 | as is. If relative, it is searched for on the classpath.</description> |
---|
876 | </property> |
---|
877 | |
---|
878 | <property> |
---|
879 | <name>plugin.auto-activation</name> |
---|
880 | <value>true</value> |
---|
881 | <description>Defines if some plugins that are not activated according to |
---|
882 | the plugin.includes and plugin.excludes properties must be automatically |
---|
883 | activated if they are needed by some activated plugins. |
---|
884 | </description> |
---|
885 | </property> |
---|
886 | |
---|
887 | <property> |
---|
888 | <name>plugin.includes</name> |
---|
889 | <value>protocol-http|urlfilter-regex|parse-(text|html|js)|index-(basic|anchor)|query-(basic|site|url)|response-(json|xml)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value> |
---|
890 | <description>Regular expression naming plugin directory names to |
---|
891 | include. Any plugin not matching this expression is excluded. |
---|
892 | In any case you need to at least include the nutch-extensionpoints plugin. By |
---|
893 | default Nutch includes crawling just HTML and plain text via HTTP, |
---|
894 | and basic indexing and search plugins. In order to use HTTPS please enable |
---|
895 | protocol-httpclient, but be aware of possible intermittent problems with the |
---|
896 | underlying commons-httpclient library. |
---|
897 | </description> |
---|
898 | </property> |
---|
899 | |
---|
900 | <property> |
---|
901 | <name>plugin.excludes</name> |
---|
902 | <value></value> |
---|
903 | <description>Regular expression naming plugin directory names to exclude. |
---|
904 | </description> |
---|
905 | </property> |
---|
906 | |
---|
907 | <!-- parser properties --> |
---|
908 | |
---|
909 | <property> |
---|
910 | <name>parse.plugin.file</name> |
---|
911 | <value>parse-plugins.xml</value> |
---|
912 | <description>The name of the file that defines the associations between |
---|
913 | content-types and parsers.</description> |
---|
914 | </property> |
---|
915 | |
---|
916 | <property> |
---|
917 | <name>parser.character.encoding.default</name> |
---|
918 | <value>windows-1252</value> |
---|
919 | <description>The character encoding to fall back to when no other information |
---|
920 | is available</description> |
---|
921 | </property> |
---|
922 | |
---|
923 | <property> |
---|
924 | <name>encodingdetector.charset.min.confidence</name> |
---|
925 | <value>-1</value> |
---|
926 | <description>An integer between 0 and 100 indicating minimum confidence value |
---|
927 | for charset auto-detection. Any negative value disables auto-detection. |
---|
928 | </description> |
---|
929 | </property> |
---|
930 | |
---|
931 | <property> |
---|
932 | <name>parser.caching.forbidden.policy</name> |
---|
933 | <value>content</value> |
---|
934 | <description>If a site (or a page) requests through its robots meta tags |
---|
935 | that it should not be shown as cached content, apply this policy. Currently |
---|
936 | three keywords are recognized: "none" ignores any "noarchive" directives. |
---|
937 | "content" doesn't show the content, but shows summaries (snippets). |
---|
938 | "all" doesn't show either content or summaries.</description> |
---|
939 | </property> |
---|
940 | |
---|
941 | |
---|
942 | <property> |
---|
943 | <name>parser.html.impl</name> |
---|
944 | <value>neko</value> |
---|
945 | <description>HTML Parser implementation. Currently the following keywords |
---|
946 | are recognized: "neko" uses NekoHTML, "tagsoup" uses TagSoup. |
---|
947 | </description> |
---|
948 | </property> |
---|
949 | |
---|
950 | <property> |
---|
951 | <name>parser.html.form.use_action</name> |
---|
952 | <value>false</value> |
---|
953 | <description>If true, HTML parser will collect URLs from form action |
---|
954 | attributes. This may lead to undesirable behavior (submitting empty |
---|
955 | forms during next fetch cycle). If false, form action attribute will |
---|
956 | be ignored.</description> |
---|
957 | </property> |
---|
958 | |
---|
959 | <property> |
---|
960 | <name>parser.html.outlinks.ignore_tags</name> |
---|
961 | <value></value> |
---|
962 | <description>Comma separated list of HTML tags, from which outlinks |
---|
963 | shouldn't be extracted. Nutch takes links from: a, area, form, frame, |
---|
964 | iframe, script, link, img. If you add any of those tags here, it |
---|
965 | won't be taken. Default is empty list. Probably reasonable value |
---|
966 | for most people would be "img,script,link".</description> |
---|
967 | </property> |
---|
968 | |
---|
969 | |
---|
970 | <!-- urlfilter plugin properties --> |
---|
971 | |
---|
972 | <property> |
---|
973 | <name>urlfilter.domain.file</name> |
---|
974 | <value>domain-urlfilter.txt</value> |
---|
975 | <description>Name of file on CLASSPATH containing either top level domains or |
---|
976 | hostnames used by urlfilter-domain (DomainURLFilter) plugin.</description> |
---|
977 | </property> |
---|
978 | |
---|
979 | <property> |
---|
980 | <name>urlfilter.regex.file</name> |
---|
981 | <value>regex-urlfilter.txt</value> |
---|
982 | <description>Name of file on CLASSPATH containing regular expressions |
---|
983 | used by urlfilter-regex (RegexURLFilter) plugin.</description> |
---|
984 | </property> |
---|
985 | |
---|
986 | <property> |
---|
987 | <name>urlfilter.automaton.file</name> |
---|
988 | <value>automaton-urlfilter.txt</value> |
---|
989 | <description>Name of file on CLASSPATH containing regular expressions |
---|
990 | used by urlfilter-automaton (AutomatonURLFilter) plugin.</description> |
---|
991 | </property> |
---|
992 | |
---|
993 | <property> |
---|
994 | <name>urlfilter.prefix.file</name> |
---|
995 | <value>prefix-urlfilter.txt</value> |
---|
996 | <description>Name of file on CLASSPATH containing url prefixes |
---|
997 | used by urlfilter-prefix (PrefixURLFilter) plugin.</description> |
---|
998 | </property> |
---|
999 | |
---|
1000 | <property> |
---|
1001 | <name>urlfilter.suffix.file</name> |
---|
1002 | <value>suffix-urlfilter.txt</value> |
---|
1003 | <description>Name of file on CLASSPATH containing url suffixes |
---|
1004 | used by urlfilter-suffix (SuffixURLFilter) plugin.</description> |
---|
1005 | </property> |
---|
1006 | |
---|
1007 | <property> |
---|
1008 | <name>urlfilter.order</name> |
---|
1009 | <value></value> |
---|
1010 | <description>The order by which url filters are applied. |
---|
1011 | If empty, all available url filters (as dictated by properties |
---|
1012 | plugin.includes and plugin.excludes above) are loaded and applied in system |
---|
1013 | defined order. If not empty, only named filters are loaded and applied |
---|
1014 | in given order. For example, if this property has value: |
---|
1015 | org.apache.nutch.urlfilter.regex.RegexURLFilter org.apache.nutch.urlfilter.prefix.PrefixURLFilter |
---|
1016 | then RegexURLFilter is applied first, and PrefixURLFilter second. |
---|
1017 | Since all filters are AND'ed, filter ordering does not have impact |
---|
1018 | on end result, but it may have performance implication, depending |
---|
1019 | on relative expensiveness of filters. |
---|
1020 | </description> |
---|
1021 | </property> |
---|
1022 | |
---|
1023 | <!-- scoring filters properties --> |
---|
1024 | |
---|
1025 | <property> |
---|
1026 | <name>scoring.filter.order</name> |
---|
1027 | <value></value> |
---|
1028 | <description>The order in which scoring filters are applied. |
---|
1029 | This may be left empty (in which case all available scoring |
---|
1030 | filters will be applied in the order defined in plugin.includes |
---|
1031 | and plugin.excludes), or a space separated list of implementation |
---|
1032 | classes. |
---|
1033 | </description> |
---|
1034 | </property> |
---|
1035 | |
---|
1036 | <!-- clustering extension properties --> |
---|
1037 | |
---|
1038 | <property> |
---|
1039 | <name>extension.clustering.hits-to-cluster</name> |
---|
1040 | <value>100</value> |
---|
1041 | <description>Number of snippets retrieved for the clustering extension |
---|
1042 | if clustering extension is available and user requested results |
---|
1043 | to be clustered.</description> |
---|
1044 | </property> |
---|
1045 | |
---|
1046 | <property> |
---|
1047 | <name>extension.clustering.extension-name</name> |
---|
1048 | <value></value> |
---|
1049 | <description>Use the specified online clustering extension. If empty, |
---|
1050 | the first available extension will be used. The "name" here refers to an 'id' |
---|
1051 | attribute of the 'implementation' element in the plugin descriptor XML |
---|
1052 | file.</description> |
---|
1053 | </property> |
---|
1054 | |
---|
1055 | <!-- ontology extension properties --> |
---|
1056 | |
---|
1057 | <property> |
---|
1058 | <name>extension.ontology.extension-name</name> |
---|
1059 | <value></value> |
---|
1060 | <description>Use the specified online ontology extension. If empty, |
---|
1061 | the first available extension will be used. The "name" here refers to an 'id' |
---|
1062 | attribute of the 'implementation' element in the plugin descriptor XML |
---|
1063 | file.</description> |
---|
1064 | </property> |
---|
1065 | |
---|
1066 | <property> |
---|
1067 | <name>extension.ontology.urls</name> |
---|
1068 | <value> |
---|
1069 | </value> |
---|
1070 | <description>Urls of owl files, separated by spaces, such as |
---|
1071 | http://www.example.com/ontology/time.owl |
---|
1072 | http://www.example.com/ontology/space.owl |
---|
1073 | http://www.example.com/ontology/wine.owl |
---|
1074 | Or |
---|
1075 | file:/ontology/time.owl |
---|
1076 | file:/ontology/space.owl |
---|
1077 | file:/ontology/wine.owl |
---|
1078 | You have to make sure each url is valid. |
---|
1079 | By default, there is no owl file, so query refinement based on ontology |
---|
1080 | is silently ignored. |
---|
1081 | </description> |
---|
1082 | </property> |
---|
1083 | |
---|
1084 | <!-- query-basic plugin properties --> |
---|
1085 | |
---|
1086 | <property> |
---|
1087 | <name>query.url.boost</name> |
---|
1088 | <value>4.0</value> |
---|
1089 | <description> Used as a boost for url field in Lucene query. |
---|
1090 | </description> |
---|
1091 | </property> |
---|
1092 | |
---|
1093 | <property> |
---|
1094 | <name>query.anchor.boost</name> |
---|
1095 | <value>2.0</value> |
---|
1096 | <description> Used as a boost for anchor field in Lucene query. |
---|
1097 | </description> |
---|
1098 | </property> |
---|
1099 | |
---|
1100 | <property> |
---|
1101 | <name>query.title.boost</name> |
---|
1102 | <value>1.5</value> |
---|
1103 | <description> Used as a boost for title field in Lucene query. |
---|
1104 | </description> |
---|
1105 | </property> |
---|
1106 | |
---|
1107 | <property> |
---|
1108 | <name>query.host.boost</name> |
---|
1109 | <value>2.0</value> |
---|
1110 | <description> Used as a boost for host field in Lucene query. |
---|
1111 | </description> |
---|
1112 | </property> |
---|
1113 | |
---|
1114 | <property> |
---|
1115 | <name>query.phrase.boost</name> |
---|
1116 | <value>1.0</value> |
---|
1117 | <description> Used as a boost for phrase in Lucene query. |
---|
1118 | Multiplied by boost for field phrase is matched in. |
---|
1119 | </description> |
---|
1120 | </property> |
---|
1121 | |
---|
1122 | <!-- |
---|
1123 | <property> |
---|
1124 | <name>query.basic.description.boost</name> |
---|
1125 | <value>1.0</value> |
---|
1126 | <description> Declares a custom field and its boost to be added to the default fields of the Lucene query. |
---|
1127 | </description> |
---|
1128 | </property> |
---|
1129 | --> |
---|
1130 | |
---|
1131 | <!-- creative-commons plugin properties --> |
---|
1132 | |
---|
1133 | <property> |
---|
1134 | <name>query.cc.boost</name> |
---|
1135 | <value>0.0</value> |
---|
1136 | <description> Used as a boost for cc field in Lucene query. |
---|
1137 | </description> |
---|
1138 | </property> |
---|
1139 | |
---|
1140 | <!-- query-more plugin properties --> |
---|
1141 | |
---|
1142 | <property> |
---|
1143 | <name>query.type.boost</name> |
---|
1144 | <value>0.0</value> |
---|
1145 | <description> Used as a boost for type field in Lucene query. |
---|
1146 | </description> |
---|
1147 | </property> |
---|
1148 | |
---|
1149 | <!-- query-site plugin properties --> |
---|
1150 | |
---|
1151 | <property> |
---|
1152 | <name>query.site.boost</name> |
---|
1153 | <value>0.0</value> |
---|
1154 | <description> Used as a boost for site field in Lucene query. |
---|
1155 | </description> |
---|
1156 | </property> |
---|
1157 | |
---|
1158 | <!-- microformats-reltag plugin properties --> |
---|
1159 | |
---|
1160 | <property> |
---|
1161 | <name>query.tag.boost</name> |
---|
1162 | <value>1.0</value> |
---|
1163 | <description> Used as a boost for tag field in Lucene query. |
---|
1164 | </description> |
---|
1165 | </property> |
---|
1166 | |
---|
1167 | <!-- language-identifier plugin properties --> |
---|
1168 | |
---|
1169 | <property> |
---|
1170 | <name>lang.ngram.min.length</name> |
---|
1171 | <value>1</value> |
---|
1172 | <description> The minimum size of ngrams to use to identify |
---|
1173 | language (must be between 1 and lang.ngram.max.length). |
---|
1174 | The larger the range between lang.ngram.min.length and |
---|
1175 | lang.ngram.max.length, the better the identification, but |
---|
1176 | the slower it is. |
---|
1177 | </description> |
---|
1178 | </property> |
---|
1179 | |
---|
1180 | <property> |
---|
1181 | <name>lang.ngram.max.length</name> |
---|
1182 | <value>4</value> |
---|
1183 | <description> The maximum size of ngrams to use to identify |
---|
1184 | language (must be between lang.ngram.min.length and 4). |
---|
1185 | The larger the range between lang.ngram.min.length and |
---|
1186 | lang.ngram.max.length, the better the identification, but |
---|
1187 | the slower it is. |
---|
1188 | </description> |
---|
1189 | </property> |
---|
1190 | |
---|
1191 | <property> |
---|
1192 | <name>lang.analyze.max.length</name> |
---|
1193 | <value>2048</value> |
---|
1194 | <description> The maximum bytes of data to use to identify |
---|
1195 | the language (0 means full content analysis). |
---|
1196 | The larger this value, the better the analysis, but the |
---|
1197 | slower it is. |
---|
1198 | </description> |
---|
1199 | </property> |
---|
1200 | |
---|
1201 | <property> |
---|
1202 | <name>query.lang.boost</name> |
---|
1203 | <value>0.0</value> |
---|
1204 | <description> Used as a boost for lang field in Lucene query. |
---|
1205 | </description> |
---|
1206 | </property> |
---|
1207 | |
---|
1208 | <!-- Temporary Hadoop 0.17.x workaround. --> |
---|
1209 | |
---|
1210 | <property> |
---|
1211 | <name>hadoop.job.history.user.location</name> |
---|
1212 | <value>${hadoop.log.dir}/history/user</value> |
---|
1213 | <description>Hadoop 0.17.x comes with a default setting to create |
---|
1214 | user logs inside the output path of the job. This breaks some |
---|
1215 | Hadoop classes, which expect the output to contain only |
---|
1216 | part-XXXXX files. This setting changes the output to a |
---|
1217 | subdirectory of the regular log directory. |
---|
1218 | </description> |
---|
1219 | </property> |
---|
1220 | |
---|
1221 | <!-- response writer properties --> |
---|
1222 | |
---|
1223 | <property> |
---|
1224 | <name>search.response.default.type</name> |
---|
1225 | <value>xml</value> |
---|
1226 | <description> |
---|
1227 | The default response type returned if none is specified. |
---|
1228 | </description> |
---|
1229 | </property> |
---|
1230 | |
---|
1231 | <property> |
---|
1232 | <name>search.response.default.lang</name> |
---|
1233 | <value>en</value> |
---|
1234 | <description> |
---|
1235 | The default response language if none is specified. |
---|
1236 | </description> |
---|
1237 | </property> |
---|
1238 | |
---|
1239 | <property> |
---|
1240 | <name>search.response.default.numrows</name> |
---|
1241 | <value>10</value> |
---|
1242 | <description> |
---|
1243 | The default number of rows to return if none is specified. |
---|
1244 | </description> |
---|
1245 | </property> |
---|
1246 | |
---|
1247 | <property> |
---|
1248 | <name>search.response.default.dedupfield</name> |
---|
1249 | <value>site</value> |
---|
1250 | <description> |
---|
1251 | The default dedup field if none is specified. |
---|
1252 | </description> |
---|
1253 | </property> |
---|
1254 | |
---|
1255 | <property> |
---|
1256 | <name>search.response.default.numdupes</name> |
---|
1257 | <value>1</value> |
---|
1258 | <description> |
---|
1259 | The default number of duplicates returned if none is specified. |
---|
1260 | </description> |
---|
1261 | </property> |
---|
1262 | |
---|
1263 | <property> |
---|
1264 | <name>searcher.response.maxage</name> |
---|
1265 | <value>86400</value> |
---|
1266 | <description> |
---|
1267 | The maxage of a response in seconds. Used in caching headers. |
---|
1268 | </description> |
---|
1269 | </property> |
---|
1270 | |
---|
1271 | <property> |
---|
1272 | <name>searcher.response.prettyprint</name> |
---|
1273 | <value>true</value> |
---|
1274 | <description> |
---|
1275 | Should the response output be pretty printed. Setting to true enables better |
---|
1276 | debugging, false removes unneeded spaces and gives better throughput. |
---|
1277 | </description> |
---|
1278 | </property> |
---|
1279 | |
---|
1280 | </configuration> |
---|