Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

nutch-default.xml @ 160

Last change on this file since 160 was 66, checked in by waue, 16 years ago
NutchEz - an easy way to nutch
File size: 40.7 KB

Line
1	<?xml version="1.0"?>
2	<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3	<!--
4	Licensed to the Apache Software Foundation (ASF) under one or more
5	contributor license agreements. See the NOTICE file distributed with
6	this work for additional information regarding copyright ownership.
7	The ASF licenses this file to You under the Apache License, Version 2.0
8	(the "License"); you may not use this file except in compliance with
9	the License. You may obtain a copy of the License at
10
11	http://www.apache.org/licenses/LICENSE-2.0
12
13	Unless required by applicable law or agreed to in writing, software
14	distributed under the License is distributed on an "AS IS" BASIS,
15	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16	See the License for the specific language governing permissions and
17	limitations under the License.
18	-->
19	<!-- Do not modify this file directly. Instead, copy entries that you -->
20	<!-- wish to modify from this file into nutch-site.xml and change them -->
21	<!-- there. If nutch-site.xml does not already exist, create it. -->
22
23	<configuration>
24
25	<!-- file properties -->
26
27	<property>
28	<name>file.content.limit</name>
29	<value>65536</value>
30	<description>The length limit for downloaded content, in bytes.
31	If this value is nonnegative (>=0), content longer than it will be truncated;
32	otherwise, no truncation at all.
33	</description>
34	</property>
35
36	<property>
37	<name>file.content.ignored</name>
38	<value>true</value>
39	<description>If true, no file content will be saved during fetch.
40	And it is probably what we want to set most of time, since file:// URLs
41	are meant to be local and we can always use them directly at parsing
42	and indexing stages. Otherwise file contents will be saved.
43	!! NOT IMPLEMENTED YET !!
44	</description>
45	</property>
46
47	<!-- HTTP properties -->
48
49	<property>
50	<name>http.agent.name</name>
51	<value></value>
52	<description>HTTP 'User-Agent' request header. MUST NOT be empty -
53	please set this to a single word uniquely related to your organization.
54
55	NOTE: You should also check other related properties:
56
57	http.robots.agents
58	http.agent.description
59	http.agent.url
60	http.agent.email
61	http.agent.version
62
63	and set their values appropriately.
64
65	</description>
66	</property>
67
68	<property>
69	<name>http.robots.agents</name>
70	<value>*</value>
71	<description>The agent strings we'll look for in robots.txt files,
72	comma-separated, in decreasing order of precedence. You should
73	put the value of http.agent.name as the first agent name, and keep the
74	default * at the end of the list. E.g.: BlurflDev,Blurfl,*
75	</description>
76	</property>
77
78	<property>
79	<name>http.robots.403.allow</name>
80	<value>true</value>
81	<description>Some servers return HTTP status 403 (Forbidden) if
82	/robots.txt doesn't exist. This should probably mean that we are
83	allowed to crawl the site nonetheless. If this is set to false,
84	then such sites will be treated as forbidden.</description>
85	</property>
86
87	<property>
88	<name>http.agent.description</name>
89	<value></value>
90	<description>Further description of our bot- this text is used in
91	the User-Agent header. It appears in parentheses after the agent name.
92	</description>
93	</property>
94
95	<property>
96	<name>http.agent.url</name>
97	<value></value>
98	<description>A URL to advertise in the User-Agent header. This will
99	appear in parentheses after the agent name. Custom dictates that this
100	should be a URL of a page explaining the purpose and behavior of this
101	crawler.
102	</description>
103	</property>
104
105	<property>
106	<name>http.agent.email</name>
107	<value></value>
108	<description>An email address to advertise in the HTTP 'From' request
109	header and User-Agent header. A good practice is to mangle this
110	address (e.g. 'info at example dot com') to avoid spamming.
111	</description>
112	</property>
113
114	<property>
115	<name>http.agent.version</name>
116	<value>Nutch-1.0</value>
117	<description>A version string to advertise in the User-Agent
118	header.</description>
119	</property>
120
121	<property>
122	<name>http.agent.host</name>
123	<value></value>
124	<description>Name or IP address of the host on which the Nutch crawler
125	would be running. Currently this is used by 'protocol-httpclient'
126	plugin.
127	</description>
128	</property>
129
130	<property>
131	<name>http.timeout</name>
132	<value>10000</value>
133	<description>The default network timeout, in milliseconds.</description>
134	</property>
135
136	<property>
137	<name>http.max.delays</name>
138	<value>100</value>
139	<description>The number of times a thread will delay when trying to
140	fetch a page. Each time it finds that a host is busy, it will wait
141	fetcher.server.delay. After http.max.delays attempts, it will give
142	up on the page for now.</description>
143	</property>
144
145	<property>
146	<name>http.content.limit</name>
147	<value>65536</value>
148	<description>The length limit for downloaded content, in bytes.
149	If this value is nonnegative (>=0), content longer than it will be truncated;
150	otherwise, no truncation at all.
151	</description>
152	</property>
153
154	<property>
155	<name>http.proxy.host</name>
156	<value></value>
157	<description>The proxy hostname. If empty, no proxy is used.</description>
158	</property>
159
160	<property>
161	<name>http.proxy.port</name>
162	<value></value>
163	<description>The proxy port.</description>
164	</property>
165
166	<property>
167	<name>http.proxy.username</name>
168	<value></value>
169	<description>Username for proxy. This will be used by
170	'protocol-httpclient', if the proxy server requests basic, digest
171	and/or NTLM authentication. To use this, 'protocol-httpclient' must
172	be present in the value of 'plugin.includes' property.
173	NOTE: For NTLM authentication, do not prefix the username with the
174	domain, i.e. 'susam' is correct whereas 'DOMAIN\susam' is incorrect.
175	</description>
176	</property>
177
178	<property>
179	<name>http.proxy.password</name>
180	<value></value>
181	<description>Password for proxy. This will be used by
182	'protocol-httpclient', if the proxy server requests basic, digest
183	and/or NTLM authentication. To use this, 'protocol-httpclient' must
184	be present in the value of 'plugin.includes' property.
185	</description>
186	</property>
187
188	<property>
189	<name>http.proxy.realm</name>
190	<value></value>
191	<description>Authentication realm for proxy. Do not define a value
192	if realm is not required or authentication should take place for any
193	realm. NTLM does not use the notion of realms. Specify the domain name
194	of NTLM authentication as the value for this property. To use this,
195	'protocol-httpclient' must be present in the value of
196	'plugin.includes' property.
197	</description>
198	</property>
199
200	<property>
201	<name>http.auth.file</name>
202	<value>httpclient-auth.xml</value>
203	<description>Authentication configuration file for
204	'protocol-httpclient' plugin.
205	</description>
206	</property>
207
208	<property>
209	<name>http.verbose</name>
210	<value>false</value>
211	<description>If true, HTTP will log more verbosely.</description>
212	</property>
213
214	<property>
215	<name>http.redirect.max</name>
216	<value>0</value>
217	<description>The maximum number of redirects the fetcher will follow when
218	trying to fetch a page. If set to negative or 0, fetcher won't immediately
219	follow redirected URLs, instead it will record them for later fetching.
220	</description>
221	</property>
222
223	<property>
224	<name>http.useHttp11</name>
225	<value>false</value>
226	<description>NOTE: at the moment this works only for protocol-httpclient.
227	If true, use HTTP 1.1; if false, use HTTP 1.0.
228	</description>
229	</property>
230
231	<!-- FTP properties -->
232
233	<property>
234	<name>ftp.username</name>
235	<value>anonymous</value>
236	<description>ftp login username.</description>
237	</property>
238
239	<property>
240	<name>ftp.password</name>
241	<value>anonymous@example.com</value>
242	<description>ftp login password.</description>
243	</property>
244
245	<property>
246	<name>ftp.content.limit</name>
247	<value>65536</value>
248	<description>The length limit for downloaded content, in bytes.
249	If this value is nonnegative (>=0), content longer than it will be truncated;
250	otherwise, no truncation at all.
251	Caution: classical ftp RFCs never define partial transfer and, in fact,
252	some ftp servers out there do not handle client side forced close-down very
253	well. Our implementation tries its best to handle such situations smoothly.
254	</description>
255	</property>
256
257	<property>
258	<name>ftp.timeout</name>
259	<value>60000</value>
260	<description>Default timeout for ftp client socket, in millisec.
261	Please also see ftp.keep.connection below.</description>
262	</property>
263
264	<property>
265	<name>ftp.server.timeout</name>
266	<value>100000</value>
267	<description>An estimation of ftp server idle time, in millisec.
268	Typically it is 120000 millisec for many ftp servers out there.
269	Better be conservative here. Together with ftp.timeout, it is used to
270	decide if we need to delete (annihilate) current ftp.client instance and
271	force to start another ftp.client instance anew. This is necessary because
272	a fetcher thread may not be able to obtain next request from queue in time
273	(due to idleness) before our ftp client times out or remote server
274	disconnects. Used only when ftp.keep.connection is true (please see below).
275	</description>
276	</property>
277
278	<property>
279	<name>ftp.keep.connection</name>
280	<value>false</value>
281	<description>Whether to keep ftp connection. Useful if crawling same host
282	again and again. When set to true, it avoids connection, login and dir list
283	parser setup for subsequent urls. If it is set to true, however, you must
284	make sure (roughly):
285	(1) ftp.timeout is less than ftp.server.timeout
286	(2) ftp.timeout is larger than (fetcher.threads.fetch * fetcher.server.delay)
287	Otherwise there will be too many "delete client because idled too long"
288	messages in thread logs.</description>
289	</property>
290
291	<property>
292	<name>ftp.follow.talk</name>
293	<value>false</value>
294	<description>Whether to log dialogue between our client and remote
295	server. Useful for debugging.</description>
296	</property>
297
298	<!-- web db properties -->
299
300	<property>
301	<name>db.default.fetch.interval</name>
302	<value>30</value>
303	<description>(DEPRECATED) The default number of days between re-fetches of a page.
304	</description>
305	</property>
306
307	<property>
308	<name>db.fetch.interval.default</name>
309	<value>2592000</value>
310	<description>The default number of seconds between re-fetches of a page (30 days).
311	</description>
312	</property>
313
314	<property>
315	<name>db.fetch.interval.max</name>
316	<value>7776000</value>
317	<description>The maximum number of seconds between re-fetches of a page
318	(90 days). After this period every page in the db will be re-tried, no
319	matter what is its status.
320	</description>
321	</property>
322
323	<property>
324	<name>db.fetch.schedule.class</name>
325	<value>org.apache.nutch.crawl.DefaultFetchSchedule</value>
326	<description>The implementation of fetch schedule. DefaultFetchSchedule simply
327	adds the original fetchInterval to the last fetch time, regardless of
328	page changes.</description>
329	</property>
330
331	<property>
332	<name>db.fetch.schedule.adaptive.inc_rate</name>
333	<value>0.4</value>
334	<description>If a page is unmodified, its fetchInterval will be
335	increased by this rate. This value should not
336	exceed 0.5, otherwise the algorithm becomes unstable.</description>
337	</property>
338
339	<property>
340	<name>db.fetch.schedule.adaptive.dec_rate</name>
341	<value>0.2</value>
342	<description>If a page is modified, its fetchInterval will be
343	decreased by this rate. This value should not
344	exceed 0.5, otherwise the algorithm becomes unstable.</description>
345	</property>
346
347	<property>
348	<name>db.fetch.schedule.adaptive.min_interval</name>
349	<value>60.0</value>
350	<description>Minimum fetchInterval, in seconds.</description>
351	</property>
352
353	<property>
354	<name>db.fetch.schedule.adaptive.max_interval</name>
355	<value>31536000.0</value>
356	<description>Maximum fetchInterval, in seconds (365 days).
357	NOTE: this is limited by db.fetch.interval.max. Pages with
358	fetchInterval larger than db.fetch.interval.max
359	will be fetched anyway.</description>
360	</property>
361
362	<property>
363	<name>db.fetch.schedule.adaptive.sync_delta</name>
364	<value>true</value>
365	<description>If true, try to synchronize with the time of page change
366	by shifting the next fetchTime by a fraction (sync_delta_rate) of the difference
367	between the last modification time, and the last fetch time.</description>
368	</property>
369
370	<property>
371	<name>db.fetch.schedule.adaptive.sync_delta_rate</name>
372	<value>0.3</value>
373	<description>See sync_delta for description. This value should not
374	exceed 0.5, otherwise the algorithm becomes unstable.</description>
375	</property>
376
377	<property>
378	<name>db.update.additions.allowed</name>
379	<value>true</value>
380	<description>If true, updatedb will add newly discovered URLs, if false
381	only already existing URLs in the CrawlDb will be updated and no new
382	URLs will be added.
383	</description>
384	</property>
385
386	<property>
387	<name>db.ignore.internal.links</name>
388	<value>true</value>
389	<description>If true, when adding new links to a page, links from
390	the same host are ignored. This is an effective way to limit the
391	size of the link database, keeping only the highest quality
392	links.
393	</description>
394	</property>
395
396	<property>
397	<name>db.ignore.external.links</name>
398	<value>false</value>
399	<description>If true, outlinks leading from a page to external hosts
400	will be ignored. This is an effective way to limit the crawl to include
401	only initially injected hosts, without creating complex URLFilters.
402	</description>
403	</property>
404
405	<property>
406	<name>db.score.injected</name>
407	<value>1.0</value>
408	<description>The score of new pages added by the injector.
409	</description>
410	</property>
411
412	<property>
413	<name>db.score.link.external</name>
414	<value>1.0</value>
415	<description>The score factor for new pages added due to a link from
416	another host relative to the referencing page's score. Scoring plugins
417	may use this value to affect initial scores of external links.
418	</description>
419	</property>
420
421	<property>
422	<name>db.score.link.internal</name>
423	<value>1.0</value>
424	<description>The score factor for pages added due to a link from the
425	same host, relative to the referencing page's score. Scoring plugins
426	may use this value to affect initial scores of internal links.
427	</description>
428	</property>
429
430	<property>
431	<name>db.score.count.filtered</name>
432	<value>false</value>
433	<description>The score value passed to newly discovered pages is
434	calculated as a fraction of the original page score divided by the
435	number of outlinks. If this option is false, only the outlinks that passed
436	URLFilters will count, if it's true then all outlinks will count.
437	</description>
438	</property>
439
440	<property>
441	<name>db.max.inlinks</name>
442	<value>10000</value>
443	<description>Maximum number of Inlinks per URL to be kept in LinkDb.
444	If "invertlinks" finds more inlinks than this number, only the first
445	N inlinks will be stored, and the rest will be discarded.
446	</description>
447	</property>
448
449	<property>
450	<name>db.max.outlinks.per.page</name>
451	<value>100</value>
452	<description>The maximum number of outlinks that we'll process for a page.
453	If this value is nonnegative (>=0), at most db.max.outlinks.per.page outlinks
454	will be processed for a page; otherwise, all outlinks will be processed.
455	</description>
456	</property>
457
458	<property>
459	<name>db.max.anchor.length</name>
460	<value>100</value>
461	<description>The maximum number of characters permitted in an anchor.
462	</description>
463	</property>
464
465	<property>
466	<name>db.fetch.retry.max</name>
467	<value>3</value>
468	<description>The maximum number of times a url that has encountered
469	recoverable errors is generated for fetch.</description>
470	</property>
471
472	<property>
473	<name>db.signature.class</name>
474	<value>org.apache.nutch.crawl.MD5Signature</value>
475	<description>The default implementation of a page signature. Signatures
476	created with this implementation will be used for duplicate detection
477	and removal.</description>
478	</property>
479
480	<property>
481	<name>db.signature.text_profile.min_token_len</name>
482	<value>2</value>
483	<description>Minimum token length to be included in the signature.
484	</description>
485	</property>
486
487	<property>
488	<name>db.signature.text_profile.quant_rate</name>
489	<value>0.01</value>
490	<description>Profile frequencies will be rounded down to a multiple of
491	QUANT = (int)(QUANT_RATE * maxFreq), where maxFreq is a maximum token
492	frequency. If maxFreq > 1 then QUANT will be at least 2, which means that
493	for longer texts tokens with frequency 1 will always be discarded.
494	</description>
495	</property>
496
497	<!-- generate properties -->
498
499	<property>
500	<name>generate.max.per.host</name>
501	<value>-1</value>
502	<description>The maximum number of urls per host in a single
503	fetchlist. -1 if unlimited.</description>
504	</property>
505
506	<property>
507	<name>generate.max.per.host.by.ip</name>
508	<value>false</value>
509	<description>If false, same host names are counted. If true,
510	hosts' IP addresses are resolved and the same IP-s are counted.
511
512	-+-+-+- WARNING !!! -+-+-+-
513	When set to true, Generator will create a lot of DNS lookup
514	requests, rapidly. This may cause a DOS attack on
515	remote DNS servers, not to mention increased external traffic
516	and latency. For these reasons when using this option it is
517	required that a local caching DNS be used.</description>
518	</property>
519
520	<property>
521	<name>generate.update.crawldb</name>
522	<value>false</value>
523	<description>For highly-concurrent environments, where several
524	generate/fetch/update cycles may overlap, setting this to true ensures
525	that generate will create different fetchlists even without intervening
526	updatedb-s, at the cost of running an additional job to update CrawlDB.
527	If false, running generate twice without intervening
528	updatedb will generate identical fetchlists.</description>
529	</property>
530
531	<!-- fetcher properties -->
532
533	<property>
534	<name>fetcher.server.delay</name>
535	<value>5.0</value>
536	<description>The number of seconds the fetcher will delay between
537	successive requests to the same server.</description>
538	</property>
539
540	<property>
541	<name>fetcher.server.min.delay</name>
542	<value>0.0</value>
543	<description>The minimum number of seconds the fetcher will delay between
544	successive requests to the same server. This value is applicable ONLY
545	if fetcher.threads.per.host is greater than 1 (i.e. the host blocking
546	is turned off).</description>
547	</property>
548
549	<property>
550	<name>fetcher.max.crawl.delay</name>
551	<value>30</value>
552	<description>
553	If the Crawl-Delay in robots.txt is set to greater than this value (in
554	seconds) then the fetcher will skip this page, generating an error report.
555	If set to -1 the fetcher will never skip such pages and will wait the
556	amount of time retrieved from robots.txt Crawl-Delay, however long that
557	might be.
558	</description>
559	</property>
560
561	<property>
562	<name>fetcher.threads.fetch</name>
563	<value>10</value>
564	<description>The number of FetcherThreads the fetcher should use.
565	This also determines the maximum number of requests that are
566	made at once (each FetcherThread handles one connection).</description>
567	</property>
568
569	<property>
570	<name>fetcher.threads.per.host</name>
571	<value>1</value>
572	<description>This number is the maximum number of threads that
573	should be allowed to access a host at one time.</description>
574	</property>
575
576	<property>
577	<name>fetcher.threads.per.host.by.ip</name>
578	<value>true</value>
579	<description>If true, then fetcher will count threads by IP address,
580	to which the URL's host name resolves. If false, only host name will be
581	used. NOTE: this should be set to the same value as
582	"generate.max.per.host.by.ip" - default settings are different only for
583	reasons of backward-compatibility.</description>
584	</property>
585
586	<property>
587	<name>fetcher.verbose</name>
588	<value>false</value>
589	<description>If true, fetcher will log more verbosely.</description>
590	</property>
591
592	<property>
593	<name>fetcher.parse</name>
594	<value>true</value>
595	<description>If true, fetcher will parse content.</description>
596	</property>
597
598	<property>
599	<name>fetcher.store.content</name>
600	<value>true</value>
601	<description>If true, fetcher will store content.</description>
602	</property>
603
604	<!-- indexer properties -->
605
606	<property>
607	<name>indexer.score.power</name>
608	<value>0.5</value>
609	<description>Determines the power of link analysis scores. Each
610	page's boost is set to <i>score<sup>scorePower</sup></i> where
611	<i>score</i> is its link analysis score and <i>scorePower</i> is the
612	value of this parameter. This is compiled into indexes, so, when
613	this is changed, pages must be re-indexed for it to take
614	effect.</description>
615	</property>
616
617	<property>
618	<name>indexer.max.title.length</name>
619	<value>100</value>
620	<description>The maximum number of characters of a title that are indexed.
621	</description>
622	</property>
623
624	<property>
625	<name>indexer.max.tokens</name>
626	<value>10000</value>
627	<description>
628	The maximum number of tokens that will be indexed for a single field
629	in a document. This limits the amount of memory required for
630	indexing, so that collections with very large files will not crash
631	the indexing process by running out of memory.
632
633	Note that this effectively truncates large documents, excluding
634	from the index tokens that occur further in the document. If you
635	know your source documents are large, be sure to set this value
636	high enough to accommodate the expected size. If you set it to
637	-1, then the only limit is your memory, but you should anticipate
638	an OutOfMemoryError.
639	</description>
640	</property>
641
642	<property>
643	<name>indexer.mergeFactor</name>
644	<value>50</value>
645	<description>The factor that determines the frequency of Lucene segment
646	merges. This must not be less than 2, higher values increase indexing
647	speed but lead to increased RAM usage, and increase the number of
648	open file handles (which may lead to "Too many open files" errors).
649	NOTE: the "segments" here have nothing to do with Nutch segments, they
650	are a low-level data unit used by Lucene.
651	</description>
652	</property>
653
654	<property>
655	<name>indexer.minMergeDocs</name>
656	<value>50</value>
657	<description>This number determines the minimum number of Lucene
658	Documents buffered in memory between Lucene segment merges. Larger
659	values increase indexing speed and increase RAM usage.
660	</description>
661	</property>
662
663	<property>
664	<name>indexer.maxMergeDocs</name>
665	<value>2147483647</value>
666	<description>This number determines the maximum number of Lucene
667	Documents to be merged into a new Lucene segment. Larger values
668	increase batch indexing speed and reduce the number of Lucene segments,
669	which reduces the number of open file handles; however, this also
670	decreases incremental indexing performance.
671	</description>
672	</property>
673
674	<property>
675	<name>indexer.termIndexInterval</name>
676	<value>128</value>
677	<description>Determines the fraction of terms which Lucene keeps in
678	RAM when searching, to facilitate random-access. Smaller values use
679	more memory but make searches somewhat faster. Larger values use
680	less memory but make searches somewhat slower.
681	</description>
682	</property>
683
684	<!-- indexingfilter plugin properties -->
685
686	<property>
687	<name>indexingfilter.order</name>
688	<value></value>
689	<description>The order by which index filters are applied.
690	If empty, all available index filters (as dictated by properties
691	plugin.includes and plugin.excludes below) are loaded and applied in system
692	defined order. If not empty, only named filters are loaded and applied
693	in given order. For example, if this property has value:
694	org.apache.nutch.indexer.basic.BasicIndexingFilter org.apache.nutch.indexer.more.MoreIndexingFilter
695	then BasicIndexingFilter is applied first, and MoreIndexingFilter second.
696
697	Filter ordering might have impact on result if one filter depends on output of
698	another filter.
699	</description>
700	</property>
701
702
703	<!-- analysis properties -->
704
705	<property>
706	<name>analysis.common.terms.file</name>
707	<value>common-terms.utf8</value>
708	<description>The name of a file containing a list of common terms
709	that should be indexed in n-grams.</description>
710	</property>
711
712	<!-- searcher properties -->
713
714	<property>
715	<name>searcher.dir</name>
716	<value>crawl</value>
717	<description>
718	Path to root of crawl. This directory is searched (in
719	order) for either the file search-servers.txt, containing a list of
720	distributed search servers, or the directory "index" containing
721	merged indexes, or the directory "segments" containing segment
722	indexes.
723	</description>
724	</property>
725
726	<property>
727	<name>searcher.filter.cache.size</name>
728	<value>16</value>
729	<description>
730	Maximum number of filters to cache. Filters can accelerate certain
731	field-based queries, like language, document format, etc. Each
732	filter requires one bit of RAM per page. So, with a 10 million page
733	index, a cache size of 16 consumes two bytes per page, or 20MB.
734	</description>
735	</property>
736
737	<property>
738	<name>searcher.filter.cache.threshold</name>
739	<value>0.05</value>
740	<description>
741	Filters are cached when their term is matched by more than this
742	fraction of pages. For example, with a threshold of 0.05, and 10
743	million pages, the term must match more than 1/20, or 500,000 pages.
744	So, if out of 10 million pages, 50% of pages are in English, and 2%
745	are in Finnish, then, with a threshold of 0.05, searches for
746	"lang:en" will use a cached filter, while searches for "lang:fi"
747	will score all 200,000 Finnish documents.
748	</description>
749	</property>
750
751	<property>
752	<name>searcher.hostgrouping.rawhits.factor</name>
753	<value>2.0</value>
754	<description>
755	A factor that is used to determine the number of raw hits
756	initially fetched, before host grouping is done.
757	</description>
758	</property>
759
760	<property>
761	<name>searcher.summary.context</name>
762	<value>5</value>
763	<description>
764	The number of context terms to display preceding and following
765	matching terms in a hit summary.
766	</description>
767	</property>
768
769	<property>
770	<name>searcher.summary.length</name>
771	<value>20</value>
772	<description>
773	The total number of terms to display in a hit summary.
774	</description>
775	</property>
776
777	<property>
778	<name>searcher.max.hits</name>
779	<value>-1</value>
780	<description>If positive, search stops after this many hits are
781	found. Setting this to small, positive values (e.g., 1000) can make
782	searches much faster. With a sorted index, the quality of the hits
783	suffers little.</description>
784	</property>
785
786	<property>
787	<name>searcher.max.time.tick_count</name>
788	<value>-1</value>
789	<description>If positive value is defined here, limit search time for
790	every request to this number of elapsed ticks (see the tick_length
791	property below). The total maximum time for any search request will be
792	then limited to tick_count * tick_length milliseconds. When search time
793	is exceeded, partial results will be returned, and the total number of
794	hits will be estimated.
795	</description>
796	</property>
797
798	<property>
799	<name>searcher.max.time.tick_length</name>
800	<value>200</value>
801	<description>The number of milliseconds between ticks. Larger values
802	reduce the timer granularity (precision). Smaller values bring more
803	overhead.
804	</description>
805	</property>
806
807	<property>
808	<name>searcher.num.handlers</name>
809	<value>10</value>
810	<description>The number of handlers for the distributed search server.
811	</description>
812	</property>
813
814	<property>
815	<name>searcher.max.hits.per.page</name>
816	<value>1000</value>
817	<description> The maximum number of hits to show per page. -1 if
818	unlimited. If the number of hits requested by the user (via
819	hitsPerPage parameter in the query string) is more than the value
820	specified in this property, then this value is assumed as the number
821	of hits per page.
822	</description>
823	</property>
824
825	<!-- URL normalizer properties -->
826
827	<property>
828	<name>urlnormalizer.order</name>
829	<value>org.apache.nutch.net.urlnormalizer.basic.BasicURLNormalizer org.apache.nutch.net.urlnormalizer.regex.RegexURLNormalizer</value>
830	<description>Order in which normalizers will run. If any of these isn't
831	activated it will be silently skipped. If other normalizers not on the
832	list are activated, they will run in random order after the ones
833	specified here are run.
834	</description>
835	</property>
836
837	<property>
838	<name>urlnormalizer.regex.file</name>
839	<value>regex-normalize.xml</value>
840	<description>Name of the config file used by the RegexUrlNormalizer class.
841	</description>
842	</property>
843
844	<property>
845	<name>urlnormalizer.loop.count</name>
846	<value>1</value>
847	<description>Optionally loop through normalizers several times, to make
848	sure that all transformations have been performed.
849	</description>
850	</property>
851
852	<!-- mime properties -->
853
854	<property>
855	<name>mime.types.file</name>
856	<value>tika-mimetypes.xml</value>
857	<description>Name of file in CLASSPATH containing filename extension and
858	magic sequence to mime types mapping information</description>
859	</property>
860
861	<property>
862	<name>mime.type.magic</name>
863	<value>true</value>
864	<description>Defines if the mime content type detector uses magic resolution.
865	</description>
866	</property>
867
868	<!-- plugin properties -->
869
870	<property>
871	<name>plugin.folders</name>
872	<value>plugins</value>
873	<description>Directories where nutch plugins are located. Each
874	element may be a relative or absolute path. If absolute, it is used
875	as is. If relative, it is searched for on the classpath.</description>
876	</property>
877
878	<property>
879	<name>plugin.auto-activation</name>
880	<value>true</value>
881	<description>Defines if some plugins that are not activated regarding
882	the plugin.includes and plugin.excludes properties must be automatically
883	activated if they are needed by some activated plugins.
884	</description>
885	</property>
886
887	<property>
888	<name>plugin.includes</name>
889	<value>protocol-http|urlfilter-regex|parse-(text|html|js)|index-(basic|anchor)|query-(basic|site|url)|response-(json|xml)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
890	<description>Regular expression naming plugin directory names to
891	include. Any plugin not matching this expression is excluded.
892	In any case you need to include at least the nutch-extensionpoints plugin. By
893	default Nutch includes crawling just HTML and plain text via HTTP,
894	and basic indexing and search plugins. In order to use HTTPS please enable
895	protocol-httpclient, but be aware of possible intermittent problems with the
896	underlying commons-httpclient library.
897	</description>
898	</property>
899
900	<property>
901	<name>plugin.excludes</name>
902	<value></value>
903	<description>Regular expression naming plugin directory names to exclude.
904	</description>
905	</property>
906
907	<!-- parser properties -->
908
909	<property>
910	<name>parse.plugin.file</name>
911	<value>parse-plugins.xml</value>
912	<description>The name of the file that defines the associations between
913	content-types and parsers.</description>
914	</property>
915
916	<property>
917	<name>parser.character.encoding.default</name>
918	<value>windows-1252</value>
919	<description>The character encoding to fall back to when no other information
920	is available</description>
921	</property>
922
923	<property>
924	<name>encodingdetector.charset.min.confidence</name>
925	<value>-1</value>
926	<description>An integer between 0 and 100 indicating the minimum confidence value
927	for charset auto-detection. Any negative value disables auto-detection.
928	</description>
929	</property>
930
931	<property>
932	<name>parser.caching.forbidden.policy</name>
933	<value>content</value>
934	<description>If a site (or a page) requests through its robot metatags
935	that it should not be shown as cached content, apply this policy. Currently
936	three keywords are recognized: "none" ignores any "noarchive" directives.
937	"content" doesn't show the content, but shows summaries (snippets).
938	"all" doesn't show either content or summaries.</description>
939	</property>
940
941
942	<property>
943	<name>parser.html.impl</name>
944	<value>neko</value>
945	<description>HTML Parser implementation. Currently the following keywords
946	are recognized: "neko" uses NekoHTML, "tagsoup" uses TagSoup.
947	</description>
948	</property>
949
950	<property>
951	<name>parser.html.form.use_action</name>
952	<value>false</value>
953	<description>If true, HTML parser will collect URLs from form action
954	attributes. This may lead to undesirable behavior (submitting empty
955	forms during next fetch cycle). If false, form action attribute will
956	be ignored.</description>
957	</property>
958
959	<property>
960	<name>parser.html.outlinks.ignore_tags</name>
961	<value></value>
962	<description>Comma separated list of HTML tags, from which outlinks
963	shouldn't be extracted. Nutch takes links from: a, area, form, frame,
964	iframe, script, link, img. If you add any of those tags here, outlinks
965	won't be taken from it. The default is an empty list. A reasonable value
966	for most people would probably be "img,script,link".</description>
967	</property>
968
969
970	<!-- urlfilter plugin properties -->
971
972	<property>
973	<name>urlfilter.domain.file</name>
974	<value>domain-urlfilter.txt</value>
975	<description>Name of file on CLASSPATH containing either top level domains or
976	hostnames used by urlfilter-domain (DomainURLFilter) plugin.</description>
977	</property>
978
979	<property>
980	<name>urlfilter.regex.file</name>
981	<value>regex-urlfilter.txt</value>
982	<description>Name of file on CLASSPATH containing regular expressions
983	used by urlfilter-regex (RegexURLFilter) plugin.</description>
984	</property>
985
986	<property>
987	<name>urlfilter.automaton.file</name>
988	<value>automaton-urlfilter.txt</value>
989	<description>Name of file on CLASSPATH containing regular expressions
990	used by urlfilter-automaton (AutomatonURLFilter) plugin.</description>
991	</property>
992
993	<property>
994	<name>urlfilter.prefix.file</name>
995	<value>prefix-urlfilter.txt</value>
996	<description>Name of file on CLASSPATH containing url prefixes
997	used by urlfilter-prefix (PrefixURLFilter) plugin.</description>
998	</property>
999
1000	<property>
1001	<name>urlfilter.suffix.file</name>
1002	<value>suffix-urlfilter.txt</value>
1003	<description>Name of file on CLASSPATH containing url suffixes
1004	used by urlfilter-suffix (SuffixURLFilter) plugin.</description>
1005	</property>
1006
1007	<property>
1008	<name>urlfilter.order</name>
1009	<value></value>
1010	<description>The order by which url filters are applied.
1011	If empty, all available url filters (as dictated by properties
1012	plugin.includes and plugin.excludes above) are loaded and applied in system
1013	defined order. If not empty, only named filters are loaded and applied
1014	in given order. For example, if this property has value:
1015	org.apache.nutch.urlfilter.regex.RegexURLFilter org.apache.nutch.urlfilter.prefix.PrefixURLFilter
1016	then RegexURLFilter is applied first, and PrefixURLFilter second.
1017	Since all filters are AND'ed, filter ordering does not have an impact
1018	on the end result, but it may have performance implications, depending
1019	on the relative expensiveness of the filters.
1020	</description>
1021	</property>
1022
1023	<!-- scoring filters properties -->
1024
1025	<property>
1026	<name>scoring.filter.order</name>
1027	<value></value>
1028	<description>The order in which scoring filters are applied.
1029	This may be left empty (in which case all available scoring
1030	filters will be applied in the order defined in plugin.includes
1031	and plugin.excludes), or a space separated list of implementation
1032	classes.
1033	</description>
1034	</property>
1035
1036	<!-- clustering extension properties -->
1037
1038	<property>
1039	<name>extension.clustering.hits-to-cluster</name>
1040	<value>100</value>
1041	<description>Number of snippets retrieved for the clustering extension
1042	if clustering extension is available and user requested results
1043	to be clustered.</description>
1044	</property>
1045
1046	<property>
1047	<name>extension.clustering.extension-name</name>
1048	<value></value>
1049	<description>Use the specified online clustering extension. If empty,
1050	the first available extension will be used. The "name" here refers to an 'id'
1051	attribute of the 'implementation' element in the plugin descriptor XML
1052	file.</description>
1053	</property>
1054
1055	<!-- ontology extension properties -->
1056
1057	<property>
1058	<name>extension.ontology.extension-name</name>
1059	<value></value>
1060	<description>Use the specified online ontology extension. If empty,
1061	the first available extension will be used. The "name" here refers to an 'id'
1062	attribute of the 'implementation' element in the plugin descriptor XML
1063	file.</description>
1064	</property>
1065
1066	<property>
1067	<name>extension.ontology.urls</name>
1068	<value>
1069	</value>
1070	<description>Urls of owl files, separated by spaces, such as
1071	http://www.example.com/ontology/time.owl
1072	http://www.example.com/ontology/space.owl
1073	http://www.example.com/ontology/wine.owl
1074	Or
1075	file:/ontology/time.owl
1076	file:/ontology/space.owl
1077	file:/ontology/wine.owl
1078	You have to make sure each url is valid.
1079	By default, there is no owl file, so query refinement based on ontology
1080	is silently ignored.
1081	</description>
1082	</property>
1083
1084	<!-- query-basic plugin properties -->
1085
1086	<property>
1087	<name>query.url.boost</name>
1088	<value>4.0</value>
1089	<description> Used as a boost for url field in Lucene query.
1090	</description>
1091	</property>
1092
1093	<property>
1094	<name>query.anchor.boost</name>
1095	<value>2.0</value>
1096	<description> Used as a boost for anchor field in Lucene query.
1097	</description>
1098	</property>
1099
1100	<property>
1101	<name>query.title.boost</name>
1102	<value>1.5</value>
1103	<description> Used as a boost for title field in Lucene query.
1104	</description>
1105	</property>
1106
1107	<property>
1108	<name>query.host.boost</name>
1109	<value>2.0</value>
1110	<description> Used as a boost for host field in Lucene query.
1111	</description>
1112	</property>
1113
1114	<property>
1115	<name>query.phrase.boost</name>
1116	<value>1.0</value>
1117	<description> Used as a boost for phrase in Lucene query.
1118	Multiplied by the boost for the field the phrase is matched in.
1119	</description>
1120	</property>
1121
1122	<!--
1123	<property>
1124	<name>query.basic.description.boost</name>
1125	<value>1.0</value>
1126	<description> Declares a custom field and its boost to be added to the default fields of the Lucene query.
1127	</description>
1128	</property>
1129	-->
1130
1131	<!-- creative-commons plugin properties -->
1132
1133	<property>
1134	<name>query.cc.boost</name>
1135	<value>0.0</value>
1136	<description> Used as a boost for cc field in Lucene query.
1137	</description>
1138	</property>
1139
1140	<!-- query-more plugin properties -->
1141
1142	<property>
1143	<name>query.type.boost</name>
1144	<value>0.0</value>
1145	<description> Used as a boost for type field in Lucene query.
1146	</description>
1147	</property>
1148
1149	<!-- query-site plugin properties -->
1150
1151	<property>
1152	<name>query.site.boost</name>
1153	<value>0.0</value>
1154	<description> Used as a boost for site field in Lucene query.
1155	</description>
1156	</property>
1157
1158	<!-- microformats-reltag plugin properties -->
1159
1160	<property>
1161	<name>query.tag.boost</name>
1162	<value>1.0</value>
1163	<description> Used as a boost for tag field in Lucene query.
1164	</description>
1165	</property>
1166
1167	<!-- language-identifier plugin properties -->
1168
1169	<property>
1170	<name>lang.ngram.min.length</name>
1171	<value>1</value>
1172	<description> The minimum size of ngrams to use to identify the
1173	language (must be between 1 and lang.ngram.max.length).
1174	The larger the range between lang.ngram.min.length and
1175	lang.ngram.max.length, the better the identification, but
1176	the slower it is.
1177	</description>
1178	</property>
1179
1180	<property>
1181	<name>lang.ngram.max.length</name>
1182	<value>4</value>
1183	<description> The maximum size of ngrams to use to identify the
1184	language (must be between lang.ngram.min.length and 4).
1185	The larger the range between lang.ngram.min.length and
1186	lang.ngram.max.length, the better the identification, but
1187	the slower it is.
1188	</description>
1189	</property>
1190
1191	<property>
1192	<name>lang.analyze.max.length</name>
1193	<value>2048</value>
1194	<description> The maximum number of bytes of data to use to identify
1195	the language (0 means full content analysis).
1196	The larger this value, the better the analysis, but the
1197	slower it is.
1198	</description>
1199	</property>
1200
1201	<property>
1202	<name>query.lang.boost</name>
1203	<value>0.0</value>
1204	<description> Used as a boost for lang field in Lucene query.
1205	</description>
1206	</property>
1207
1208	<!-- Temporary Hadoop 0.17.x workaround. -->
1209
1210	<property>
1211	<name>hadoop.job.history.user.location</name>
1212	<value>${hadoop.log.dir}/history/user</value>
1213	<description>Hadoop 0.17.x comes with a default setting to create
1214	user logs inside the output path of the job. This breaks some
1215	Hadoop classes, which expect the output to contain only
1216	part-XXXXX files. This setting changes the output to a
1217	subdirectory of the regular log directory.
1218	</description>
1219	</property>
1220
1221	<!-- response writer properties -->
1222
1223	<property>
1224	<name>search.response.default.type</name>
1225	<value>xml</value>
1226	<description>
1227	The default response type returned if none is specified.
1228	</description>
1229	</property>
1230
1231	<property>
1232	<name>search.response.default.lang</name>
1233	<value>en</value>
1234	<description>
1235	The default response language if none is specified.
1236	</description>
1237	</property>
1238
1239	<property>
1240	<name>search.response.default.numrows</name>
1241	<value>10</value>
1242	<description>
1243	The default number of rows to return if none is specified.
1244	</description>
1245	</property>
1246
1247	<property>
1248	<name>search.response.default.dedupfield</name>
1249	<value>site</value>
1250	<description>
1251	The default dedup field if none is specified.
1252	</description>
1253	</property>
1254
1255	<property>
1256	<name>search.response.default.numdupes</name>
1257	<value>1</value>
1258	<description>
1259	The default number of duplicates returned if none is specified.
1260	</description>
1261	</property>
1262
1263	<property>
1264	<name>searcher.response.maxage</name>
1265	<value>86400</value>
1266	<description>
1267	The maxage of a response in seconds. Used in caching headers.
1268	</description>
1269	</property>
1270
1271	<property>
1272	<name>searcher.response.prettyprint</name>
1273	<value>true</value>
1274	<description>
1275	Whether the response output should be pretty printed. Setting to true enables better
1276	debugging, false removes unneeded spaces and gives better throughput.
1277	</description>
1278	</property>
1279
1280	</configuration>

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: nutchez-0.1/tomcat/webapps/ROOT/WEB-INF/classes/nutch-default.xml @ 160

Download in other formats: