source: nutchez-0.1/tomcat/webapps/ROOT/WEB-INF/classes/nutch-default.xml @ 98

Last change on this file since 98 was 66, checked in by waue, 16 years ago

NutchEz - an easy way to nutch

File size: 40.7 KB
Line 
1<?xml version="1.0"?>
2<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3<!--
4 Licensed to the Apache Software Foundation (ASF) under one or more
5 contributor license agreements.  See the NOTICE file distributed with
6 this work for additional information regarding copyright ownership.
7 The ASF licenses this file to You under the Apache License, Version 2.0
8 (the "License"); you may not use this file except in compliance with
9 the License.  You may obtain a copy of the License at
10
11     http://www.apache.org/licenses/LICENSE-2.0
12
13 Unless required by applicable law or agreed to in writing, software
14 distributed under the License is distributed on an "AS IS" BASIS,
15 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 See the License for the specific language governing permissions and
17 limitations under the License.
18-->
19<!-- Do not modify this file directly.  Instead, copy entries that you -->
20<!-- wish to modify from this file into nutch-site.xml and change them -->
21<!-- there.  If nutch-site.xml does not already exist, create it.      -->
22
23<configuration>
24
25<!-- file properties -->
26
27<property>
28  <name>file.content.limit</name>
29  <value>65536</value>
30  <description>The length limit for downloaded content, in bytes.
31  If this value is nonnegative (>=0), content longer than it will be truncated;
32  otherwise, no truncation at all.
33  </description>
34</property>
35
36<property>
37  <name>file.content.ignored</name>
38  <value>true</value>
39  <description>If true, no file content will be saved during fetch.
40  And it is probably what we want to set most of the time, since file:// URLs
41  are meant to be local and we can always use them directly at parsing
42  and indexing stages. Otherwise file contents will be saved.
43  !! NOT IMPLEMENTED YET !!
44  </description>
45</property>
46
47<!-- HTTP properties -->
48
49<property>
50  <name>http.agent.name</name>
51  <value></value>
52  <description>HTTP 'User-Agent' request header. MUST NOT be empty -
53  please set this to a single word uniquely related to your organization.
54
55  NOTE: You should also check other related properties:
56
57  http.robots.agents
58  http.agent.description
59  http.agent.url
60  http.agent.email
61  http.agent.version
62
63  and set their values appropriately.
64
65  </description>
66</property>
67
68<property>
69  <name>http.robots.agents</name>
70  <value>*</value>
71  <description>The agent strings we'll look for in robots.txt files,
72  comma-separated, in decreasing order of precedence. You should
73  put the value of http.agent.name as the first agent name, and keep the
74  default * at the end of the list. E.g.: BlurflDev,Blurfl,*
75  </description>
76</property>
77
78<property>
79  <name>http.robots.403.allow</name>
80  <value>true</value>
81  <description>Some servers return HTTP status 403 (Forbidden) if
82  /robots.txt doesn't exist. This should probably mean that we are
83  allowed to crawl the site nonetheless. If this is set to false,
84  then such sites will be treated as forbidden.</description>
85</property>
86
87<property>
88  <name>http.agent.description</name>
89  <value></value>
90  <description>Further description of our bot- this text is used in
91  the User-Agent header.  It appears in parentheses after the agent name.
92  </description>
93</property>
94
95<property>
96  <name>http.agent.url</name>
97  <value></value>
98  <description>A URL to advertise in the User-Agent header.  This will
99   appear in parentheses after the agent name. Custom dictates that this
100   should be a URL of a page explaining the purpose and behavior of this
101   crawler.
102  </description>
103</property>
104
105<property>
106  <name>http.agent.email</name>
107  <value></value>
108  <description>An email address to advertise in the HTTP 'From' request
109   header and User-Agent header. A good practice is to mangle this
110   address (e.g. 'info at example dot com') to avoid spamming.
111  </description>
112</property>
113
114<property>
115  <name>http.agent.version</name>
116  <value>Nutch-1.0</value>
117  <description>A version string to advertise in the User-Agent
118   header.</description>
119</property>
120
121<property>
122  <name>http.agent.host</name>
123  <value></value>
124  <description>Name or IP address of the host on which the Nutch crawler
125  would be running. Currently this is used by 'protocol-httpclient'
126  plugin.
127  </description>
128</property>
129
130<property>
131  <name>http.timeout</name>
132  <value>10000</value>
133  <description>The default network timeout, in milliseconds.</description>
134</property>
135
136<property>
137  <name>http.max.delays</name>
138  <value>100</value>
139  <description>The number of times a thread will delay when trying to
140  fetch a page.  Each time it finds that a host is busy, it will wait
141  fetcher.server.delay.  After http.max.delays attempts, it will give
142  up on the page for now.</description>
143</property>
144
145<property>
146  <name>http.content.limit</name>
147  <value>65536</value>
148  <description>The length limit for downloaded content, in bytes.
149  If this value is nonnegative (>=0), content longer than it will be truncated;
150  otherwise, no truncation at all.
151  </description>
152</property>
153
154<property>
155  <name>http.proxy.host</name>
156  <value></value>
157  <description>The proxy hostname.  If empty, no proxy is used.</description>
158</property>
159
160<property>
161  <name>http.proxy.port</name>
162  <value></value>
163  <description>The proxy port.</description>
164</property>
165
166<property>
167  <name>http.proxy.username</name>
168  <value></value>
169  <description>Username for proxy. This will be used by
170  'protocol-httpclient', if the proxy server requests basic, digest
171  and/or NTLM authentication. To use this, 'protocol-httpclient' must
172  be present in the value of 'plugin.includes' property.
173  NOTE: For NTLM authentication, do not prefix the username with the
174  domain, i.e. 'susam' is correct whereas 'DOMAIN\susam' is incorrect.
175  </description>
176</property>
177
178<property>
179  <name>http.proxy.password</name>
180  <value></value>
181  <description>Password for proxy. This will be used by
182  'protocol-httpclient', if the proxy server requests basic, digest
183  and/or NTLM authentication. To use this, 'protocol-httpclient' must
184  be present in the value of 'plugin.includes' property.
185  </description>
186</property>
187
188<property>
189  <name>http.proxy.realm</name>
190  <value></value>
191  <description>Authentication realm for proxy. Do not define a value
192  if realm is not required or authentication should take place for any
193  realm. NTLM does not use the notion of realms. Specify the domain name
194  of NTLM authentication as the value for this property. To use this,
195  'protocol-httpclient' must be present in the value of
196  'plugin.includes' property.
197  </description>
198</property>
199
200<property>
201  <name>http.auth.file</name>
202  <value>httpclient-auth.xml</value>
203  <description>Authentication configuration file for
204  'protocol-httpclient' plugin.
205  </description>
206</property>
207
208<property>
209  <name>http.verbose</name>
210  <value>false</value>
211  <description>If true, HTTP will log more verbosely.</description>
212</property>
213
214<property>
215  <name>http.redirect.max</name>
216  <value>0</value>
217  <description>The maximum number of redirects the fetcher will follow when
218  trying to fetch a page. If set to negative or 0, fetcher won't immediately
219  follow redirected URLs, instead it will record them for later fetching.
220  </description>
221</property>
222
223<property>
224  <name>http.useHttp11</name>
225  <value>false</value>
226  <description>NOTE: at the moment this works only for protocol-httpclient.
227  If true, use HTTP 1.1; if false, use HTTP 1.0.
228  </description>
229</property>
230
231<!-- FTP properties -->
232
233<property>
234  <name>ftp.username</name>
235  <value>anonymous</value>
236  <description>ftp login username.</description>
237</property>
238
239<property>
240  <name>ftp.password</name>
241  <value>anonymous@example.com</value>
242  <description>ftp login password.</description>
243</property>
244
245<property>
246  <name>ftp.content.limit</name>
247  <value>65536</value> 
248  <description>The length limit for downloaded content, in bytes.
249  If this value is nonnegative (>=0), content longer than it will be truncated;
250  otherwise, no truncation at all.
251  Caution: classical ftp RFCs never define partial transfer and, in fact,
252  some ftp servers out there do not handle client side forced close-down very
253  well. Our implementation tries its best to handle such situations smoothly.
254  </description>
255</property>
256
257<property>
258  <name>ftp.timeout</name>
259  <value>60000</value>
260  <description>Default timeout for ftp client socket, in millisec.
261  Please also see ftp.keep.connection below.</description>
262</property>
263
264<property>
265  <name>ftp.server.timeout</name>
266  <value>100000</value>
267  <description>An estimation of ftp server idle time, in millisec.
268  Typically it is 120000 millisec for many ftp servers out there.
269  Better be conservative here. Together with ftp.timeout, it is used to
270  decide if we need to delete (annihilate) current ftp.client instance and
271  force to start another ftp.client instance anew. This is necessary because
272  a fetcher thread may not be able to obtain next request from queue in time
273  (due to idleness) before our ftp client times out or remote server
274  disconnects. Used only when ftp.keep.connection is true (please see below).
275  </description>
276</property>
277
278<property>
279  <name>ftp.keep.connection</name>
280  <value>false</value>
281  <description>Whether to keep ftp connection. Useful if crawling same host
282  again and again. When set to true, it avoids connection, login and dir list
283  parser setup for subsequent urls. If it is set to true, however, you must
284  make sure (roughly):
285  (1) ftp.timeout is less than ftp.server.timeout
286  (2) ftp.timeout is larger than (fetcher.threads.fetch * fetcher.server.delay)
287  Otherwise there will be too many "delete client because idled too long"
288  messages in thread logs.</description>
289</property>
290
291<property>
292  <name>ftp.follow.talk</name>
293  <value>false</value>
294  <description>Whether to log dialogue between our client and remote
295  server. Useful for debugging.</description>
296</property>
297
298<!-- web db properties -->
299
300<property>
301  <name>db.default.fetch.interval</name>
302  <value>30</value>
303  <description>(DEPRECATED) The default number of days between re-fetches of a page.
304  </description>
305</property>
306
307<property>
308  <name>db.fetch.interval.default</name>
309  <value>2592000</value>
310  <description>The default number of seconds between re-fetches of a page (30 days).
311  </description>
312</property>
313
314<property>
315  <name>db.fetch.interval.max</name>
316  <value>7776000</value>
317  <description>The maximum number of seconds between re-fetches of a page
318  (90 days). After this period every page in the db will be re-tried, no
319  matter what is its status.
320  </description>
321</property>
322
323<property>
324  <name>db.fetch.schedule.class</name>
325  <value>org.apache.nutch.crawl.DefaultFetchSchedule</value>
326  <description>The implementation of fetch schedule. DefaultFetchSchedule simply
327  adds the original fetchInterval to the last fetch time, regardless of
328  page changes.</description>
329</property>
330
331<property>
332  <name>db.fetch.schedule.adaptive.inc_rate</name>
333  <value>0.4</value>
334  <description>If a page is unmodified, its fetchInterval will be
335  increased by this rate. This value should not
336  exceed 0.5, otherwise the algorithm becomes unstable.</description>
337</property>
338
339<property>
340  <name>db.fetch.schedule.adaptive.dec_rate</name>
341  <value>0.2</value>
342  <description>If a page is modified, its fetchInterval will be
343  decreased by this rate. This value should not
344  exceed 0.5, otherwise the algorithm becomes unstable.</description>
345</property>
346
347<property>
348  <name>db.fetch.schedule.adaptive.min_interval</name>
349  <value>60.0</value>
350  <description>Minimum fetchInterval, in seconds.</description>
351</property>
352
353<property>
354  <name>db.fetch.schedule.adaptive.max_interval</name>
355  <value>31536000.0</value>
356  <description>Maximum fetchInterval, in seconds (365 days).
357  NOTE: this is limited by db.fetch.interval.max. Pages with
358  fetchInterval larger than db.fetch.interval.max
359  will be fetched anyway.</description>
360</property>
361
362<property>
363  <name>db.fetch.schedule.adaptive.sync_delta</name>
364  <value>true</value>
365  <description>If true, try to synchronize with the time of page change
366  by shifting the next fetchTime by a fraction (sync_delta_rate) of the difference
367  between the last modification time and the last fetch time.</description>
368</property>
369
370<property>
371  <name>db.fetch.schedule.adaptive.sync_delta_rate</name>
372  <value>0.3</value>
373  <description>See sync_delta for description. This value should not
374  exceed 0.5, otherwise the algorithm becomes unstable.</description>
375</property>
376
377<property>
378  <name>db.update.additions.allowed</name>
379  <value>true</value>
380  <description>If true, updatedb will add newly discovered URLs, if false
381  only already existing URLs in the CrawlDb will be updated and no new
382  URLs will be added.
383  </description>
384</property>
385
386<property>
387  <name>db.ignore.internal.links</name>
388  <value>true</value>
389  <description>If true, when adding new links to a page, links from
390  the same host are ignored.  This is an effective way to limit the
391  size of the link database, keeping only the highest quality
392  links.
393  </description>
394</property>
395
396<property>
397  <name>db.ignore.external.links</name>
398  <value>false</value>
399  <description>If true, outlinks leading from a page to external hosts
400  will be ignored. This is an effective way to limit the crawl to include
401  only initially injected hosts, without creating complex URLFilters.
402  </description>
403</property>
404
405<property>
406  <name>db.score.injected</name>
407  <value>1.0</value>
408  <description>The score of new pages added by the injector.
409  </description>
410</property>
411
412<property>
413  <name>db.score.link.external</name>
414  <value>1.0</value>
415  <description>The score factor for new pages added due to a link from
416  another host relative to the referencing page's score. Scoring plugins
417  may use this value to affect initial scores of external links.
418  </description>
419</property>
420
421<property>
422  <name>db.score.link.internal</name>
423  <value>1.0</value>
424  <description>The score factor for pages added due to a link from the
425  same host, relative to the referencing page's score. Scoring plugins
426  may use this value to affect initial scores of internal links.
427  </description>
428</property>
429
430<property>
431  <name>db.score.count.filtered</name>
432  <value>false</value>
433  <description>The score value passed to newly discovered pages is
434  calculated as a fraction of the original page score divided by the
435  number of outlinks. If this option is false, only the outlinks that passed
436  URLFilters will count, if it's true then all outlinks will count.
437  </description>
438</property>
439
440<property>
441  <name>db.max.inlinks</name>
442  <value>10000</value>
443  <description>Maximum number of Inlinks per URL to be kept in LinkDb.
444  If "invertlinks" finds more inlinks than this number, only the first
445  N inlinks will be stored, and the rest will be discarded.
446  </description>
447</property>
448
449<property>
450  <name>db.max.outlinks.per.page</name>
451  <value>100</value>
452  <description>The maximum number of outlinks that we'll process for a page.
453  If this value is nonnegative (>=0), at most db.max.outlinks.per.page outlinks
454  will be processed for a page; otherwise, all outlinks will be processed.
455  </description>
456</property>
457
458<property>
459  <name>db.max.anchor.length</name>
460  <value>100</value>
461  <description>The maximum number of characters permitted in an anchor.
462  </description>
463</property>
464
465<property>
466  <name>db.fetch.retry.max</name>
467  <value>3</value>
468  <description>The maximum number of times a url that has encountered
469  recoverable errors is generated for fetch.</description>
470</property>
471
472<property>
473  <name>db.signature.class</name>
474  <value>org.apache.nutch.crawl.MD5Signature</value>
475  <description>The default implementation of a page signature. Signatures
476  created with this implementation will be used for duplicate detection
477  and removal.</description>
478</property>
479
480<property>
481  <name>db.signature.text_profile.min_token_len</name>
482  <value>2</value>
483  <description>Minimum token length to be included in the signature.
484  </description>
485</property>
486
487<property>
488  <name>db.signature.text_profile.quant_rate</name>
489  <value>0.01</value>
490  <description>Profile frequencies will be rounded down to a multiple of
491  QUANT = (int)(QUANT_RATE * maxFreq), where maxFreq is a maximum token
492  frequency. If maxFreq > 1 then QUANT will be at least 2, which means that
493  for longer texts tokens with frequency 1 will always be discarded.
494  </description>
495</property>
496
497<!-- generate properties -->
498
499<property>
500  <name>generate.max.per.host</name>
501  <value>-1</value>
502  <description>The maximum number of urls per host in a single
503  fetchlist.  -1 if unlimited.</description>
504</property>
505
506<property>
507  <name>generate.max.per.host.by.ip</name>
508  <value>false</value>
509  <description>If false, same host names are counted. If true,
510  hosts' IP addresses are resolved and the same IP-s are counted.
511 
512  -+-+-+- WARNING !!! -+-+-+-
513  When set to true, Generator will create a lot of DNS lookup
514  requests, rapidly. This may cause a DOS attack on
515  remote DNS servers, not to mention increased external traffic
516  and latency. For these reasons when using this option it is
517  required that a local caching DNS be used.</description>
518</property>
519
520<property>
521  <name>generate.update.crawldb</name>
522  <value>false</value>
523  <description>For highly-concurrent environments, where several
524  generate/fetch/update cycles may overlap, setting this to true ensures
525  that generate will create different fetchlists even without intervening
526  updatedb-s, at the cost of running an additional job to update CrawlDB.
527  If false, running generate twice without intervening
528  updatedb will generate identical fetchlists.</description>
529</property>
530
531<!-- fetcher properties -->
532
533<property>
534  <name>fetcher.server.delay</name>
535  <value>5.0</value>
536  <description>The number of seconds the fetcher will delay between
537   successive requests to the same server.</description>
538</property>
539
540<property>
541  <name>fetcher.server.min.delay</name>
542  <value>0.0</value>
543  <description>The minimum number of seconds the fetcher will delay between
544  successive requests to the same server. This value is applicable ONLY
545  if fetcher.threads.per.host is greater than 1 (i.e. the host blocking
546  is turned off).</description>
547</property>
548
549<property>
550 <name>fetcher.max.crawl.delay</name>
551 <value>30</value>
552 <description>
553 If the Crawl-Delay in robots.txt is set to greater than this value (in
554 seconds) then the fetcher will skip this page, generating an error report.
555 If set to -1 the fetcher will never skip such pages and will wait the
556 amount of time retrieved from robots.txt Crawl-Delay, however long that
557 might be.
558 </description>
559</property> 
560
561<property>
562  <name>fetcher.threads.fetch</name>
563  <value>10</value>
564  <description>The number of FetcherThreads the fetcher should use.
565    This also determines the maximum number of requests that are
566    made at once (each FetcherThread handles one connection).</description>
567</property>
568
569<property>
570  <name>fetcher.threads.per.host</name>
571  <value>1</value>
572  <description>This number is the maximum number of threads that
573    should be allowed to access a host at one time.</description>
574</property>
575
576<property>
577  <name>fetcher.threads.per.host.by.ip</name>
578  <value>true</value>
579  <description>If true, then fetcher will count threads by IP address,
580  to which the URL's host name resolves. If false, only host name will be
581  used. NOTE: this should be set to the same value as
582  "generate.max.per.host.by.ip" - default settings are different only for
583  reasons of backward-compatibility.</description>
584</property>
585
586<property>
587  <name>fetcher.verbose</name>
588  <value>false</value>
589  <description>If true, fetcher will log more verbosely.</description>
590</property>
591
592<property>
593  <name>fetcher.parse</name>
594  <value>true</value>
595  <description>If true, fetcher will parse content.</description>
596</property>
597
598<property>
599  <name>fetcher.store.content</name>
600  <value>true</value>
601  <description>If true, fetcher will store content.</description>
602</property>
603
604<!-- indexer properties -->
605
606<property>
607  <name>indexer.score.power</name>
608  <value>0.5</value>
609  <description>Determines the power of link analysis scores.  Each
610  page's boost is set to <i>score<sup>scorePower</sup></i> where
611  <i>score</i> is its link analysis score and <i>scorePower</i> is the
612  value of this parameter.  This is compiled into indexes, so, when
613  this is changed, pages must be re-indexed for it to take
614  effect.</description>
615</property>
616
617<property>
618  <name>indexer.max.title.length</name>
619  <value>100</value>
620  <description>The maximum number of characters of a title that are indexed.
621  </description>
622</property>
623
624<property>
625  <name>indexer.max.tokens</name>
626  <value>10000</value>
627  <description>
628  The maximum number of tokens that will be indexed for a single field
629  in a document. This limits the amount of memory required for
630  indexing, so that collections with very large files will not crash
631  the indexing process by running out of memory.
632
633  Note that this effectively truncates large documents, excluding
634  from the index tokens that occur further in the document. If you
635  know your source documents are large, be sure to set this value
636  high enough to accommodate the expected size. If you set it to
637  -1, then the only limit is your memory, but you should anticipate
638  an OutOfMemoryError.
639  </description>
640</property>
641
642<property>
643  <name>indexer.mergeFactor</name>
644  <value>50</value>
645  <description>The factor that determines the frequency of Lucene segment
646  merges. This must not be less than 2, higher values increase indexing
647  speed but lead to increased RAM usage, and increase the number of
648  open file handles (which may lead to "Too many open files" errors).
649  NOTE: the "segments" here have nothing to do with Nutch segments, they
650  are a low-level data unit used by Lucene.
651  </description>
652</property>
653
654<property>
655  <name>indexer.minMergeDocs</name>
656  <value>50</value>
657  <description>This number determines the minimum number of Lucene
658  Documents buffered in memory between Lucene segment merges. Larger
659  values increase indexing speed and increase RAM usage.
660  </description>
661</property>
662
663<property>
664  <name>indexer.maxMergeDocs</name>
665  <value>2147483647</value>
666  <description>This number determines the maximum number of Lucene
667  Documents to be merged into a new Lucene segment. Larger values
668  increase batch indexing speed and reduce the number of Lucene segments,
669  which reduces the number of open file handles; however, this also
670  decreases incremental indexing performance.
671  </description>
672</property>
673
674<property>
675  <name>indexer.termIndexInterval</name>
676  <value>128</value>
677  <description>Determines the fraction of terms which Lucene keeps in
678  RAM when searching, to facilitate random-access.  Smaller values use
679  more memory but make searches somewhat faster.  Larger values use
680  less memory but make searches somewhat slower.
681  </description>
682</property>
683
684<!-- indexingfilter plugin properties -->
685
686<property>
687  <name>indexingfilter.order</name>
688  <value></value>
689  <description>The order by which index filters are applied.
690  If empty, all available index filters (as dictated by properties
691  plugin.includes and plugin.excludes below) are loaded and applied in system
692  defined order. If not empty, only named filters are loaded and applied
693  in given order. For example, if this property has value:
694  org.apache.nutch.indexer.basic.BasicIndexingFilter org.apache.nutch.indexer.more.MoreIndexingFilter
695  then BasicIndexingFilter is applied first, and MoreIndexingFilter second.
696 
697  Filter ordering might have an impact on the result if one filter depends on the output of
698  another filter.
699  </description>
700</property>
701
702
703<!-- analysis properties -->
704
705<property>
706  <name>analysis.common.terms.file</name>
707  <value>common-terms.utf8</value>
708  <description>The name of a file containing a list of common terms
709  that should be indexed in n-grams.</description>
710</property>
711
712<!-- searcher properties -->
713
714<property>
715  <name>searcher.dir</name>
716  <value>crawl</value>
717  <description>
718  Path to root of crawl.  This directory is searched (in
719  order) for either the file search-servers.txt, containing a list of
720  distributed search servers, or the directory "index" containing
721  merged indexes, or the directory "segments" containing segment
722  indexes.
723  </description>
724</property>
725
726<property>
727  <name>searcher.filter.cache.size</name>
728  <value>16</value>
729  <description>
730  Maximum number of filters to cache.  Filters can accelerate certain
731  field-based queries, like language, document format, etc.  Each
732  filter requires one bit of RAM per page.  So, with a 10 million page
733  index, a cache size of 16 consumes two bytes per page, or 20MB.
734  </description>
735</property>
736
737<property>
738  <name>searcher.filter.cache.threshold</name>
739  <value>0.05</value>
740  <description>
741  Filters are cached when their term is matched by more than this
742  fraction of pages.  For example, with a threshold of 0.05, and 10
743  million pages, the term must match more than 1/20, or 500,000 pages.
744  So, if out of 10 million pages, 50% of pages are in English, and 2%
745  are in Finnish, then, with a threshold of 0.05, searches for
746  "lang:en" will use a cached filter, while searches for "lang:fi"
747  will score all 200,000 Finnish documents.
748  </description>
749</property>
750
751<property>
752  <name>searcher.hostgrouping.rawhits.factor</name>
753  <value>2.0</value>
754  <description>
755  A factor that is used to determine the number of raw hits
756  initially fetched, before host grouping is done.
757  </description>
758</property>
759
760<property>
761  <name>searcher.summary.context</name>
762  <value>5</value>
763  <description>
764  The number of context terms to display preceding and following
765  matching terms in a hit summary.
766  </description>
767</property>
768
769<property>
770  <name>searcher.summary.length</name>
771  <value>20</value>
772  <description>
773  The total number of terms to display in a hit summary.
774  </description>
775</property>
776
777<property>
778  <name>searcher.max.hits</name>
779  <value>-1</value>
780  <description>If positive, search stops after this many hits are
781  found.  Setting this to small, positive values (e.g., 1000) can make
782  searches much faster.  With a sorted index, the quality of the hits
783  suffers little.</description>
784</property>
785
786<property>
787  <name>searcher.max.time.tick_count</name>
788  <value>-1</value>
789  <description>If positive value is defined here, limit search time for
790  every request to this number of elapsed ticks (see the tick_length
791  property below). The total maximum time for any search request will be
792  then limited to tick_count * tick_length milliseconds. When search time
793  is exceeded, partial results will be returned, and the total number of
794  hits will be estimated.
795  </description>
796</property>
797
798<property>
799  <name>searcher.max.time.tick_length</name>
800  <value>200</value>
801  <description>The number of milliseconds between ticks. Larger values
802  reduce the timer granularity (precision). Smaller values bring more
803  overhead.
804  </description>
805</property>
806
807<property>
808  <name>searcher.num.handlers</name>
809  <value>10</value>
810  <description>The number of handlers for the distributed search server.
811  </description>
812</property>
813
814<property>
815  <name>searcher.max.hits.per.page</name>
816  <value>1000</value>
817  <description> The maximum number of hits to show per page. -1 if
818    unlimited. If the number of hits requested by the user (via
819    hitsPerPage parameter in the query string) is more than the value
820    specified in this property, then this value is assumed as the number
821    of hits per page.
822  </description>
823</property>
824
825<!-- URL normalizer properties -->
826
827<property>
828  <name>urlnormalizer.order</name>
829  <value>org.apache.nutch.net.urlnormalizer.basic.BasicURLNormalizer org.apache.nutch.net.urlnormalizer.regex.RegexURLNormalizer</value>
830  <description>Order in which normalizers will run. If any of these isn't
831  activated it will be silently skipped. If other normalizers not on the
832  list are activated, they will run in random order after the ones
833  specified here are run.
834  </description>
835</property>
836
837<property>
838  <name>urlnormalizer.regex.file</name>
839  <value>regex-normalize.xml</value>
840  <description>Name of the config file used by the RegexURLNormalizer class.
841  </description>
842</property>
843
844<property>
845  <name>urlnormalizer.loop.count</name>
846  <value>1</value>
847  <description>Optionally loop through normalizers several times, to make
848  sure that all transformations have been performed.
849  </description>
850</property>
851
852<!-- mime properties -->
853
854<property>
855  <name>mime.types.file</name>
856  <value>tika-mimetypes.xml</value>
857  <description>Name of file in CLASSPATH containing filename extension and
858  magic sequence to mime types mapping information</description>
859</property>
860
861<property>
862  <name>mime.type.magic</name>
863  <value>true</value>
864  <description>Defines if the mime content type detector uses magic resolution.
865  </description>
866</property>
867
868<!-- plugin properties -->
869
870<property>
871  <name>plugin.folders</name>
872  <value>plugins</value>
873  <description>Directories where nutch plugins are located.  Each
874  element may be a relative or absolute path.  If absolute, it is used
875  as is.  If relative, it is searched for on the classpath.</description>
876</property>
877
878<property>
879  <name>plugin.auto-activation</name>
880  <value>true</value>
881  <description>Defines if some plugins that are not activated regarding
882  the plugin.includes and plugin.excludes properties must be automatically
883  activated if they are needed by some activated plugins.
884  </description>
885</property>
886
887<property>
888  <name>plugin.includes</name>
889  <value>protocol-http|urlfilter-regex|parse-(text|html|js)|index-(basic|anchor)|query-(basic|site|url)|response-(json|xml)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
890  <description>Regular expression naming plugin directory names to
891  include.  Any plugin not matching this expression is excluded.
892  In any case you need to at least include the nutch-extensionpoints plugin. By
893  default Nutch includes crawling just HTML and plain text via HTTP,
894  and basic indexing and search plugins. In order to use HTTPS please enable
895  protocol-httpclient, but be aware of possible intermittent problems with the
896  underlying commons-httpclient library.
897  </description>
898</property>
899
900<property>
901  <name>plugin.excludes</name>
902  <value></value>
903  <description>Regular expression naming plugin directory names to exclude. 
904  </description>
905</property>
906
907<!-- parser properties -->
908
909<property>
910  <name>parse.plugin.file</name>
911  <value>parse-plugins.xml</value>
912  <description>The name of the file that defines the associations between
913  content-types and parsers.</description>
914</property>
915
916<property>
917  <name>parser.character.encoding.default</name>
918  <value>windows-1252</value>
919  <description>The character encoding to fall back to when no other information
920  is available</description>
921</property>
922
923<property>
924  <name>encodingdetector.charset.min.confidence</name>
925  <value>-1</value>
926  <description>An integer between 0 and 100 indicating the minimum confidence value
927  for charset auto-detection. Any negative value disables auto-detection.
928  </description>
929</property>
930
931<property>
932  <name>parser.caching.forbidden.policy</name>
933  <value>content</value>
934  <description>If a site (or a page) requests through its robots meta tags
935  that it should not be shown as cached content, apply this policy. Currently
936  three keywords are recognized: "none" ignores any "noarchive" directives.
937  "content" doesn't show the content, but shows summaries (snippets).
938  "all" doesn't show either content or summaries.</description>
939</property>
940
941
942<property>
943  <name>parser.html.impl</name>
944  <value>neko</value>
945  <description>HTML Parser implementation. Currently the following keywords
946  are recognized: "neko" uses NekoHTML, "tagsoup" uses TagSoup.
947  </description>
948</property>
949
950<property>
951  <name>parser.html.form.use_action</name>
952  <value>false</value>
953  <description>If true, HTML parser will collect URLs from form action
954  attributes. This may lead to undesirable behavior (submitting empty
955  forms during next fetch cycle). If false, form action attribute will
956  be ignored.</description>
957</property>
958
959<property>
960  <name>parser.html.outlinks.ignore_tags</name>
961  <value></value>
962  <description>Comma separated list of HTML tags, from which outlinks
963  shouldn't be extracted. Nutch takes links from: a, area, form, frame,
964  iframe, script, link, img. If you add any of those tags here, it
965  won't be taken. Default is empty list. Probably reasonable value
966  for most people would be "img,script,link".</description>
967</property>
968
969
970<!-- urlfilter plugin properties -->
971
972<property>
973  <name>urlfilter.domain.file</name>
974  <value>domain-urlfilter.txt</value>
975  <description>Name of file on CLASSPATH containing either top level domains or
976  hostnames used by urlfilter-domain (DomainURLFilter) plugin.</description>
977</property>
978
979<property>
980  <name>urlfilter.regex.file</name>
981  <value>regex-urlfilter.txt</value>
982  <description>Name of file on CLASSPATH containing regular expressions
983  used by urlfilter-regex (RegexURLFilter) plugin.</description>
984</property>
985
986<property>
987  <name>urlfilter.automaton.file</name>
988  <value>automaton-urlfilter.txt</value>
989  <description>Name of file on CLASSPATH containing regular expressions
990  used by urlfilter-automaton (AutomatonURLFilter) plugin.</description>
991</property>
992
993<property>
994  <name>urlfilter.prefix.file</name>
995  <value>prefix-urlfilter.txt</value>
996  <description>Name of file on CLASSPATH containing url prefixes
997  used by urlfilter-prefix (PrefixURLFilter) plugin.</description>
998</property>
999
1000<property>
1001  <name>urlfilter.suffix.file</name>
1002  <value>suffix-urlfilter.txt</value>
1003  <description>Name of file on CLASSPATH containing url suffixes
1004  used by urlfilter-suffix (SuffixURLFilter) plugin.</description>
1005</property>
1006
1007<property>
1008  <name>urlfilter.order</name>
1009  <value></value>
1010  <description>The order by which url filters are applied.
1011  If empty, all available url filters (as dictated by properties
1012  plugin.includes and plugin.excludes above) are loaded and applied in system
1013  defined order. If not empty, only named filters are loaded and applied
1014  in given order. For example, if this property has value:
1015  org.apache.nutch.urlfilter.regex.RegexURLFilter org.apache.nutch.urlfilter.prefix.PrefixURLFilter
1016  then RegexURLFilter is applied first, and PrefixURLFilter second.
1017  Since all filters are AND'ed, filter ordering does not have an impact
1018  on the end result, but it may have performance implications, depending
1019  on the relative expensiveness of the filters.
1020  </description>
1021</property>
1022
1023<!-- scoring filters properties -->
1024
1025<property>
1026  <name>scoring.filter.order</name>
1027  <value></value>
1028  <description>The order in which scoring filters are applied.
1029  This may be left empty (in which case all available scoring
1030  filters will be applied in the order defined in plugin.includes
1031  and plugin.excludes), or a space separated list of implementation
1032  classes.
1033  </description>
1034</property>
1035
1036<!-- clustering extension properties -->
1037
1038<property>
1039  <name>extension.clustering.hits-to-cluster</name>
1040  <value>100</value>
1041  <description>Number of snippets retrieved for the clustering extension
1042  if clustering extension is available and user requested results
1043  to be clustered.</description>
1044</property>
1045
1046<property>
1047  <name>extension.clustering.extension-name</name>
1048  <value></value>
1049  <description>Use the specified online clustering extension. If empty,
1050  the first available extension will be used. The "name" here refers to an 'id'
1051  attribute of the 'implementation' element in the plugin descriptor XML
1052  file.</description>
1053</property>
1054
1055<!-- ontology extension properties -->
1056
1057<property>
1058  <name>extension.ontology.extension-name</name>
1059  <value></value>
1060  <description>Use the specified online ontology extension. If empty,
1061  the first available extension will be used. The "name" here refers to an 'id'
1062  attribute of the 'implementation' element in the plugin descriptor XML
1063  file.</description>
1064</property>
1065
1066<property>
1067  <name>extension.ontology.urls</name>
1068  <value>
1069  </value>
1070  <description>Urls of owl files, separated by spaces, such as
1071  http://www.example.com/ontology/time.owl
1072  http://www.example.com/ontology/space.owl
1073  http://www.example.com/ontology/wine.owl
1074  Or
1075  file:/ontology/time.owl
1076  file:/ontology/space.owl
1077  file:/ontology/wine.owl
1078  You have to make sure each url is valid.
1079  By default, there is no owl file, so query refinement based on ontology
1080  is silently ignored.
1081  </description>
1082</property>
1083
1084<!-- query-basic plugin properties -->
1085
1086<property>
1087  <name>query.url.boost</name>
1088  <value>4.0</value>
1089  <description> Used as a boost for url field in Lucene query.
1090  </description>
1091</property>
1092
1093<property>
1094  <name>query.anchor.boost</name>
1095  <value>2.0</value>
1096  <description> Used as a boost for anchor field in Lucene query.
1097  </description>
1098</property>
1099
1100<property>
1101  <name>query.title.boost</name>
1102  <value>1.5</value>
1103  <description> Used as a boost for title field in Lucene query.
1104  </description>
1105</property>
1106
1107<property>
1108  <name>query.host.boost</name>
1109  <value>2.0</value>
1110  <description> Used as a boost for host field in Lucene query.
1111  </description>
1112</property>
1113
1114<property>
1115  <name>query.phrase.boost</name>
1116  <value>1.0</value>
1117  <description> Used as a boost for phrase in Lucene query.
1118  Multiplied by the boost for the field the phrase is matched in.
1119  </description>
1120</property>
1121
1122<!--
1123<property>
1124  <name>query.basic.description.boost</name>
1125  <value>1.0</value>
1126  <description> Declares a custom field and its boost to be added to the default fields of the Lucene query.
1127  </description>
1128</property>
1129-->
1130
1131<!-- creative-commons plugin properties -->
1132
1133<property>
1134  <name>query.cc.boost</name>
1135  <value>0.0</value>
1136  <description> Used as a boost for cc field in Lucene query.
1137  </description>
1138</property>
1139
1140<!-- query-more plugin properties -->
1141
1142<property>
1143  <name>query.type.boost</name>
1144  <value>0.0</value>
1145  <description> Used as a boost for type field in Lucene query.
1146  </description>
1147</property>
1148
1149<!-- query-site plugin properties -->
1150
1151<property>
1152  <name>query.site.boost</name>
1153  <value>0.0</value>
1154  <description> Used as a boost for site field in Lucene query.
1155  </description>
1156</property>
1157
1158<!-- microformats-reltag plugin properties -->
1159
1160<property>
1161  <name>query.tag.boost</name>
1162  <value>1.0</value>
1163  <description> Used as a boost for tag field in Lucene query.
1164  </description>
1165</property>
1166
1167<!-- language-identifier plugin properties -->
1168
1169<property>
1170  <name>lang.ngram.min.length</name>
1171  <value>1</value>
1172  <description> The minimum size of ngrams to use to identify
1173  language (must be between 1 and lang.ngram.max.length).
1174  The larger the range between lang.ngram.min.length and
1175  lang.ngram.max.length, the better the identification, but
1176  the slower it is.
1177  </description>
1178</property>
1179
1180<property>
1181  <name>lang.ngram.max.length</name>
1182  <value>4</value>
1183  <description> The maximum size of ngrams to use to identify
1184  language (must be between lang.ngram.min.length and 4).
1185  The larger the range between lang.ngram.min.length and
1186  lang.ngram.max.length, the better the identification, but
1187  the slower it is.
1188  </description>
1189</property>
1190
1191<property>
1192  <name>lang.analyze.max.length</name>
1193  <value>2048</value>
1194  <description> The maximum bytes of data to use to identify
1195  the language (0 means full content analysis).
1196  The larger this value, the better the analysis, but the
1197  slower it is.
1198  </description>
1199</property>
1200
1201<property>
1202  <name>query.lang.boost</name>
1203  <value>0.0</value>
1204  <description> Used as a boost for lang field in Lucene query.
1205  </description>
1206</property>
1207
1208<!-- Temporary Hadoop 0.17.x workaround. -->
1209
1210<property>
1211  <name>hadoop.job.history.user.location</name>
1212  <value>${hadoop.log.dir}/history/user</value>
1213  <description>Hadoop 0.17.x comes with a default setting to create
1214     user logs inside the output path of the job. This breaks some
1215     Hadoop classes, which expect the output to contain only
1216     part-XXXXX files. This setting changes the output to a
1217     subdirectory of the regular log directory.
1218  </description>
1219</property>
1220
1221<!-- response writer properties -->
1222
1223<property>
1224  <name>search.response.default.type</name>
1225  <value>xml</value>
1226  <description>
1227  The default response type returned if none is specified.
1228  </description>
1229</property>
1230
1231<property>
1232  <name>search.response.default.lang</name>
1233  <value>en</value>
1234  <description>
1235  The default response language if none is specified.
1236  </description>
1237</property>
1238
1239<property>
1240  <name>search.response.default.numrows</name>
1241  <value>10</value>
1242  <description>
1243  The default number of rows to return if none is specified.
1244  </description>
1245</property>
1246
1247<property>
1248  <name>search.response.default.dedupfield</name>
1249  <value>site</value>
1250  <description>
1251  The default dedup field if none is specified.
1252  </description>
1253</property>
1254
1255<property>
1256  <name>search.response.default.numdupes</name>
1257  <value>1</value>
1258  <description>
1259  The default number of duplicates returned if none is specified.
1260  </description>
1261</property>
1262
1263<property>
1264  <name>searcher.response.maxage</name>
1265  <value>86400</value>
1266  <description>
1267  The maxage of a response in seconds. Used in caching headers.
1268  </description>
1269</property>
1270
1271<property>
1272  <name>searcher.response.prettyprint</name>
1273  <value>true</value>
1274  <description>
1275  Should the response output be pretty printed.  Setting to true enables better
1276  debugging, false removes unneeded spaces and gives better throughput.
1277  </description>
1278</property>
1279
1280</configuration>
Note: See TracBrowser for help on using the repository browser.