Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

nutch-default.xml @ 67

Last change on this file since 67 was 66, checked in by waue, 15 years ago
NutchEz - an easy way to nutch
Property svn:executable set to *
File size: 40.7 KB

Rev	Line
[66]	1	<?xml version="1.0"?>
	2	<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
	3	<!--
	4	Licensed to the Apache Software Foundation (ASF) under one or more
	5	contributor license agreements. See the NOTICE file distributed with
	6	this work for additional information regarding copyright ownership.
	7	The ASF licenses this file to You under the Apache License, Version 2.0
	8	(the "License"); you may not use this file except in compliance with
	9	the License. You may obtain a copy of the License at
	10
	11	http://www.apache.org/licenses/LICENSE-2.0
	12
	13	Unless required by applicable law or agreed to in writing, software
	14	distributed under the License is distributed on an "AS IS" BASIS,
	15	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	16	See the License for the specific language governing permissions and
	17	limitations under the License.
	18	-->
	19	<!-- Do not modify this file directly. Instead, copy entries that you -->
	20	<!-- wish to modify from this file into nutch-site.xml and change them -->
	21	<!-- there. If nutch-site.xml does not already exist, create it. -->
	22
	23	<configuration>
	24
	25	<!-- file properties -->
	26
	27	<property>
	28	<name>file.content.limit</name>
	29	<value>65536</value>
	30	<description>The length limit for downloaded content, in bytes.
	31	If this value is nonnegative (>=0), content longer than it will be truncated;
	32	otherwise, no truncation at all.
	33	</description>
	34	</property>
	35
	36	<property>
	37	<name>file.content.ignored</name>
	38	<value>true</value>
	39	<description>If true, no file content will be saved during fetch.
	40	And it is probably what we want to set most of the time, since file:// URLs
	41	are meant to be local and we can always use them directly at parsing
	42	and indexing stages. Otherwise file contents will be saved.
	43	!! NOT IMPLEMENTED YET !!
	44	</description>
	45	</property>
	46
	47	<!-- HTTP properties -->
	48
	49	<property>
	50	<name>http.agent.name</name>
	51	<value></value>
	52	<description>HTTP 'User-Agent' request header. MUST NOT be empty -
	53	please set this to a single word uniquely related to your organization.
	54
	55	NOTE: You should also check other related properties:
	56
	57	http.robots.agents
	58	http.agent.description
	59	http.agent.url
	60	http.agent.email
	61	http.agent.version
	62
	63	and set their values appropriately.
	64
	65	</description>
	66	</property>
	67
	68	<property>
	69	<name>http.robots.agents</name>
	70	<value>*</value>
	71	<description>The agent strings we'll look for in robots.txt files,
	72	comma-separated, in decreasing order of precedence. You should
	73	put the value of http.agent.name as the first agent name, and keep the
	74	default * at the end of the list. E.g.: BlurflDev,Blurfl,*
	75	</description>
	76	</property>
	77
	78	<property>
	79	<name>http.robots.403.allow</name>
	80	<value>true</value>
	81	<description>Some servers return HTTP status 403 (Forbidden) if
	82	/robots.txt doesn't exist. This should probably mean that we are
	83	allowed to crawl the site nonetheless. If this is set to false,
	84	then such sites will be treated as forbidden.</description>
	85	</property>
	86
	87	<property>
	88	<name>http.agent.description</name>
	89	<value></value>
	90	<description>Further description of our bot - this text is used in
	91	the User-Agent header. It appears in parentheses after the agent name.
	92	</description>
	93	</property>
	94
	95	<property>
	96	<name>http.agent.url</name>
	97	<value></value>
	98	<description>A URL to advertise in the User-Agent header. This will
	99	appear in parentheses after the agent name. Custom dictates that this
	100	should be a URL of a page explaining the purpose and behavior of this
	101	crawler.
	102	</description>
	103	</property>
	104
	105	<property>
	106	<name>http.agent.email</name>
	107	<value></value>
	108	<description>An email address to advertise in the HTTP 'From' request
	109	header and User-Agent header. A good practice is to mangle this
	110	address (e.g. 'info at example dot com') to avoid spamming.
	111	</description>
	112	</property>
	113
	114	<property>
	115	<name>http.agent.version</name>
	116	<value>Nutch-1.0</value>
	117	<description>A version string to advertise in the User-Agent
	118	header.</description>
	119	</property>
	120
	121	<property>
	122	<name>http.agent.host</name>
	123	<value></value>
	124	<description>Name or IP address of the host on which the Nutch crawler
	125	would be running. Currently this is used by 'protocol-httpclient'
	126	plugin.
	127	</description>
	128	</property>
	129
	130	<property>
	131	<name>http.timeout</name>
	132	<value>10000</value>
	133	<description>The default network timeout, in milliseconds.</description>
	134	</property>
	135
	136	<property>
	137	<name>http.max.delays</name>
	138	<value>100</value>
	139	<description>The number of times a thread will delay when trying to
	140	fetch a page. Each time it finds that a host is busy, it will wait
	141	fetcher.server.delay. After http.max.delays attempts, it will give
	142	up on the page for now.</description>
	143	</property>
	144
	145	<property>
	146	<name>http.content.limit</name>
	147	<value>65536</value>
	148	<description>The length limit for downloaded content, in bytes.
	149	If this value is nonnegative (>=0), content longer than it will be truncated;
	150	otherwise, no truncation at all.
	151	</description>
	152	</property>
	153
	154	<property>
	155	<name>http.proxy.host</name>
	156	<value></value>
	157	<description>The proxy hostname. If empty, no proxy is used.</description>
	158	</property>
	159
	160	<property>
	161	<name>http.proxy.port</name>
	162	<value></value>
	163	<description>The proxy port.</description>
	164	</property>
	165
	166	<property>
	167	<name>http.proxy.username</name>
	168	<value></value>
	169	<description>Username for proxy. This will be used by
	170	'protocol-httpclient', if the proxy server requests basic, digest
	171	and/or NTLM authentication. To use this, 'protocol-httpclient' must
	172	be present in the value of 'plugin.includes' property.
	173	NOTE: For NTLM authentication, do not prefix the username with the
	174	domain, i.e. 'susam' is correct whereas 'DOMAIN\susam' is incorrect.
	175	</description>
	176	</property>
	177
	178	<property>
	179	<name>http.proxy.password</name>
	180	<value></value>
	181	<description>Password for proxy. This will be used by
	182	'protocol-httpclient', if the proxy server requests basic, digest
	183	and/or NTLM authentication. To use this, 'protocol-httpclient' must
	184	be present in the value of 'plugin.includes' property.
	185	</description>
	186	</property>
	187
	188	<property>
	189	<name>http.proxy.realm</name>
	190	<value></value>
	191	<description>Authentication realm for proxy. Do not define a value
	192	if realm is not required or authentication should take place for any
	193	realm. NTLM does not use the notion of realms. Specify the domain name
	194	of NTLM authentication as the value for this property. To use this,
	195	'protocol-httpclient' must be present in the value of
	196	'plugin.includes' property.
	197	</description>
	198	</property>
	199
	200	<property>
	201	<name>http.auth.file</name>
	202	<value>httpclient-auth.xml</value>
	203	<description>Authentication configuration file for
	204	'protocol-httpclient' plugin.
	205	</description>
	206	</property>
	207
	208	<property>
	209	<name>http.verbose</name>
	210	<value>false</value>
	211	<description>If true, HTTP will log more verbosely.</description>
	212	</property>
	213
	214	<property>
	215	<name>http.redirect.max</name>
	216	<value>0</value>
	217	<description>The maximum number of redirects the fetcher will follow when
	218	trying to fetch a page. If set to negative or 0, fetcher won't immediately
	219	follow redirected URLs, instead it will record them for later fetching.
	220	</description>
	221	</property>
	222
	223	<property>
	224	<name>http.useHttp11</name>
	225	<value>false</value>
	226	<description>NOTE: at the moment this works only for protocol-httpclient.
	227	If true, use HTTP 1.1, if false use HTTP 1.0.
	228	</description>
	229	</property>
	230
	231	<!-- FTP properties -->
	232
	233	<property>
	234	<name>ftp.username</name>
	235	<value>anonymous</value>
	236	<description>ftp login username.</description>
	237	</property>
	238
	239	<property>
	240	<name>ftp.password</name>
	241	<value>anonymous@example.com</value>
	242	<description>ftp login password.</description>
	243	</property>
	244
	245	<property>
	246	<name>ftp.content.limit</name>
	247	<value>65536</value>
	248	<description>The length limit for downloaded content, in bytes.
	249	If this value is nonnegative (>=0), content longer than it will be truncated;
	250	otherwise, no truncation at all.
	251	Caution: classical ftp RFCs never define partial transfer and, in fact,
	252	some ftp servers out there do not handle client side forced close-down very
	253	well. Our implementation tries its best to handle such situations smoothly.
	254	</description>
	255	</property>
	256
	257	<property>
	258	<name>ftp.timeout</name>
	259	<value>60000</value>
	260	<description>Default timeout for ftp client socket, in millisec.
	261	Please also see ftp.keep.connection below.</description>
	262	</property>
	263
	264	<property>
	265	<name>ftp.server.timeout</name>
	266	<value>100000</value>
	267	<description>An estimation of ftp server idle time, in millisec.
	268	Typically it is 120000 millisec for many ftp servers out there.
	269	Better be conservative here. Together with ftp.timeout, it is used to
	270	decide if we need to delete (annihilate) current ftp.client instance and
	271	force to start another ftp.client instance anew. This is necessary because
	272	a fetcher thread may not be able to obtain next request from queue in time
	273	(due to idleness) before our ftp client times out or remote server
	274	disconnects. Used only when ftp.keep.connection is true (please see below).
	275	</description>
	276	</property>
	277
	278	<property>
	279	<name>ftp.keep.connection</name>
	280	<value>false</value>
	281	<description>Whether to keep ftp connection. Useful if crawling same host
	282	again and again. When set to true, it avoids connection, login and dir list
	283	parser setup for subsequent urls. If it is set to true, however, you must
	284	make sure (roughly):
	285	(1) ftp.timeout is less than ftp.server.timeout
	286	(2) ftp.timeout is larger than (fetcher.threads.fetch * fetcher.server.delay)
	287	Otherwise there will be too many "delete client because idled too long"
	288	messages in thread logs.</description>
	289	</property>
	290
	291	<property>
	292	<name>ftp.follow.talk</name>
	293	<value>false</value>
	294	<description>Whether to log dialogue between our client and remote
	295	server. Useful for debugging.</description>
	296	</property>
	297
	298	<!-- web db properties -->
	299
	300	<property>
	301	<name>db.default.fetch.interval</name>
	302	<value>30</value>
	303	<description>(DEPRECATED) The default number of days between re-fetches of a page.
	304	</description>
	305	</property>
	306
	307	<property>
	308	<name>db.fetch.interval.default</name>
	309	<value>2592000</value>
	310	<description>The default number of seconds between re-fetches of a page (30 days).
	311	</description>
	312	</property>
	313
	314	<property>
	315	<name>db.fetch.interval.max</name>
	316	<value>7776000</value>
	317	<description>The maximum number of seconds between re-fetches of a page
	318	(90 days). After this period every page in the db will be re-tried, no
	319	matter what its status is.
	320	</description>
	321	</property>
	322
	323	<property>
	324	<name>db.fetch.schedule.class</name>
	325	<value>org.apache.nutch.crawl.DefaultFetchSchedule</value>
	326	<description>The implementation of fetch schedule. DefaultFetchSchedule simply
	327	adds the original fetchInterval to the last fetch time, regardless of
	328	page changes.</description>
	329	</property>
	330
	331	<property>
	332	<name>db.fetch.schedule.adaptive.inc_rate</name>
	333	<value>0.4</value>
	334	<description>If a page is unmodified, its fetchInterval will be
	335	increased by this rate. This value should not
	336	exceed 0.5, otherwise the algorithm becomes unstable.</description>
	337	</property>
	338
	339	<property>
	340	<name>db.fetch.schedule.adaptive.dec_rate</name>
	341	<value>0.2</value>
	342	<description>If a page is modified, its fetchInterval will be
	343	decreased by this rate. This value should not
	344	exceed 0.5, otherwise the algorithm becomes unstable.</description>
	345	</property>
	346
	347	<property>
	348	<name>db.fetch.schedule.adaptive.min_interval</name>
	349	<value>60.0</value>
	350	<description>Minimum fetchInterval, in seconds.</description>
	351	</property>
	352
	353	<property>
	354	<name>db.fetch.schedule.adaptive.max_interval</name>
	355	<value>31536000.0</value>
	356	<description>Maximum fetchInterval, in seconds (365 days).
	357	NOTE: this is limited by db.fetch.interval.max. Pages with
	358	fetchInterval larger than db.fetch.interval.max
	359	will be fetched anyway.</description>
	360	</property>
	361
	362	<property>
	363	<name>db.fetch.schedule.adaptive.sync_delta</name>
	364	<value>true</value>
	365	<description>If true, try to synchronize with the time of page change
	366	by shifting the next fetchTime by a fraction (sync_delta_rate) of the difference
	367	between the last modification time, and the last fetch time.</description>
	368	</property>
	369
	370	<property>
	371	<name>db.fetch.schedule.adaptive.sync_delta_rate</name>
	372	<value>0.3</value>
	373	<description>See sync_delta for description. This value should not
	374	exceed 0.5, otherwise the algorithm becomes unstable.</description>
	375	</property>
	376
	377	<property>
	378	<name>db.update.additions.allowed</name>
	379	<value>true</value>
	380	<description>If true, updatedb will add newly discovered URLs, if false
	381	only already existing URLs in the CrawlDb will be updated and no new
	382	URLs will be added.
	383	</description>
	384	</property>
	385
	386	<property>
	387	<name>db.ignore.internal.links</name>
	388	<value>true</value>
	389	<description>If true, when adding new links to a page, links from
	390	the same host are ignored. This is an effective way to limit the
	391	size of the link database, keeping only the highest quality
	392	links.
	393	</description>
	394	</property>
	395
	396	<property>
	397	<name>db.ignore.external.links</name>
	398	<value>false</value>
	399	<description>If true, outlinks leading from a page to external hosts
	400	will be ignored. This is an effective way to limit the crawl to include
	401	only initially injected hosts, without creating complex URLFilters.
	402	</description>
	403	</property>
	404
	405	<property>
	406	<name>db.score.injected</name>
	407	<value>1.0</value>
	408	<description>The score of new pages added by the injector.
	409	</description>
	410	</property>
	411
	412	<property>
	413	<name>db.score.link.external</name>
	414	<value>1.0</value>
	415	<description>The score factor for new pages added due to a link from
	416	another host relative to the referencing page's score. Scoring plugins
	417	may use this value to affect initial scores of external links.
	418	</description>
	419	</property>
	420
	421	<property>
	422	<name>db.score.link.internal</name>
	423	<value>1.0</value>
	424	<description>The score factor for pages added due to a link from the
	425	same host, relative to the referencing page's score. Scoring plugins
	426	may use this value to affect initial scores of internal links.
	427	</description>
	428	</property>
	429
	430	<property>
	431	<name>db.score.count.filtered</name>
	432	<value>false</value>
	433	<description>The score value passed to newly discovered pages is
	434	calculated as a fraction of the original page score divided by the
	435	number of outlinks. If this option is false, only the outlinks that passed
	436	URLFilters will count, if it's true then all outlinks will count.
	437	</description>
	438	</property>
	439
	440	<property>
	441	<name>db.max.inlinks</name>
	442	<value>10000</value>
	443	<description>Maximum number of Inlinks per URL to be kept in LinkDb.
	444	If "invertlinks" finds more inlinks than this number, only the first
	445	N inlinks will be stored, and the rest will be discarded.
	446	</description>
	447	</property>
	448
	449	<property>
	450	<name>db.max.outlinks.per.page</name>
	451	<value>100</value>
	452	<description>The maximum number of outlinks that we'll process for a page.
	453	If this value is nonnegative (>=0), at most db.max.outlinks.per.page outlinks
	454	will be processed for a page; otherwise, all outlinks will be processed.
	455	</description>
	456	</property>
	457
	458	<property>
	459	<name>db.max.anchor.length</name>
	460	<value>100</value>
	461	<description>The maximum number of characters permitted in an anchor.
	462	</description>
	463	</property>
	464
	465	<property>
	466	<name>db.fetch.retry.max</name>
	467	<value>3</value>
	468	<description>The maximum number of times a url that has encountered
	469	recoverable errors is generated for fetch.</description>
	470	</property>
	471
	472	<property>
	473	<name>db.signature.class</name>
	474	<value>org.apache.nutch.crawl.MD5Signature</value>
	475	<description>The default implementation of a page signature. Signatures
	476	created with this implementation will be used for duplicate detection
	477	and removal.</description>
	478	</property>
	479
	480	<property>
	481	<name>db.signature.text_profile.min_token_len</name>
	482	<value>2</value>
	483	<description>Minimum token length to be included in the signature.
	484	</description>
	485	</property>
	486
	487	<property>
	488	<name>db.signature.text_profile.quant_rate</name>
	489	<value>0.01</value>
	490	<description>Profile frequencies will be rounded down to a multiple of
	491	QUANT = (int)(QUANT_RATE * maxFreq), where maxFreq is a maximum token
	492	frequency. If maxFreq > 1 then QUANT will be at least 2, which means that
	493	for longer texts tokens with frequency 1 will always be discarded.
	494	</description>
	495	</property>
	496
	497	<!-- generate properties -->
	498
	499	<property>
	500	<name>generate.max.per.host</name>
	501	<value>-1</value>
	502	<description>The maximum number of urls per host in a single
	503	fetchlist. -1 if unlimited.</description>
	504	</property>
	505
	506	<property>
	507	<name>generate.max.per.host.by.ip</name>
	508	<value>false</value>
	509	<description>If false, same host names are counted. If true,
	510	hosts' IP addresses are resolved and the same IP-s are counted.
	511
	512	-+-+-+- WARNING !!! -+-+-+-
	513	When set to true, Generator will create a lot of DNS lookup
	514	requests, rapidly. This may cause a DOS attack on
	515	remote DNS servers, not to mention increased external traffic
	516	and latency. For these reasons when using this option it is
	517	required that a local caching DNS be used.</description>
	518	</property>
	519
	520	<property>
	521	<name>generate.update.crawldb</name>
	522	<value>false</value>
	523	<description>For highly-concurrent environments, where several
	524	generate/fetch/update cycles may overlap, setting this to true ensures
	525	that generate will create different fetchlists even without intervening
	526	updatedb-s, at the cost of running an additional job to update CrawlDB.
	527	If false, running generate twice without intervening
	528	updatedb will generate identical fetchlists.</description>
	529	</property>
	530
	531	<!-- fetcher properties -->
	532
	533	<property>
	534	<name>fetcher.server.delay</name>
	535	<value>5.0</value>
	536	<description>The number of seconds the fetcher will delay between
	537	successive requests to the same server.</description>
	538	</property>
	539
	540	<property>
	541	<name>fetcher.server.min.delay</name>
	542	<value>0.0</value>
	543	<description>The minimum number of seconds the fetcher will delay between
	544	successive requests to the same server. This value is applicable ONLY
	545	if fetcher.threads.per.host is greater than 1 (i.e. the host blocking
	546	is turned off).</description>
	547	</property>
	548
	549	<property>
	550	<name>fetcher.max.crawl.delay</name>
	551	<value>30</value>
	552	<description>
	553	If the Crawl-Delay in robots.txt is set to greater than this value (in
	554	seconds) then the fetcher will skip this page, generating an error report.
	555	If set to -1 the fetcher will never skip such pages and will wait the
	556	amount of time retrieved from robots.txt Crawl-Delay, however long that
	557	might be.
	558	</description>
	559	</property>
	560
	561	<property>
	562	<name>fetcher.threads.fetch</name>
	563	<value>10</value>
	564	<description>The number of FetcherThreads the fetcher should use.
	565	This also determines the maximum number of requests that are
	566	made at once (each FetcherThread handles one connection).</description>
	567	</property>
	568
	569	<property>
	570	<name>fetcher.threads.per.host</name>
	571	<value>1</value>
	572	<description>This number is the maximum number of threads that
	573	should be allowed to access a host at one time.</description>
	574	</property>
	575
	576	<property>
	577	<name>fetcher.threads.per.host.by.ip</name>
	578	<value>true</value>
	579	<description>If true, then fetcher will count threads by IP address,
	580	to which the URL's host name resolves. If false, only host name will be
	581	used. NOTE: this should be set to the same value as
	582	"generate.max.per.host.by.ip" - default settings are different only for
	583	reasons of backward-compatibility.</description>
	584	</property>
	585
	586	<property>
	587	<name>fetcher.verbose</name>
	588	<value>false</value>
	589	<description>If true, fetcher will log more verbosely.</description>
	590	</property>
	591
	592	<property>
	593	<name>fetcher.parse</name>
	594	<value>true</value>
	595	<description>If true, fetcher will parse content.</description>
	596	</property>
	597
	598	<property>
	599	<name>fetcher.store.content</name>
	600	<value>true</value>
	601	<description>If true, fetcher will store content.</description>
	602	</property>
	603
	604	<!-- indexer properties -->
	605
	606	<property>
	607	<name>indexer.score.power</name>
	608	<value>0.5</value>
	609	<description>Determines the power of link analysis scores. Each
	610	page's boost is set to <i>score<sup>scorePower</sup></i> where
	611	<i>score</i> is its link analysis score and <i>scorePower</i> is the
	612	value of this parameter. This is compiled into indexes, so, when
	613	this is changed, pages must be re-indexed for it to take
	614	effect.</description>
	615	</property>
	616
	617	<property>
	618	<name>indexer.max.title.length</name>
	619	<value>100</value>
	620	<description>The maximum number of characters of a title that are indexed.
	621	</description>
	622	</property>
	623
	624	<property>
	625	<name>indexer.max.tokens</name>
	626	<value>10000</value>
	627	<description>
	628	The maximum number of tokens that will be indexed for a single field
	629	in a document. This limits the amount of memory required for
	630	indexing, so that collections with very large files will not crash
	631	the indexing process by running out of memory.
	632
	633	Note that this effectively truncates large documents, excluding
	634	from the index tokens that occur further in the document. If you
	635	know your source documents are large, be sure to set this value
	636	high enough to accommodate the expected size. If you set it to
	637	-1, then the only limit is your memory, but you should anticipate
	638	an OutOfMemoryError.
	639	</description>
	640	</property>
	641
	642	<property>
	643	<name>indexer.mergeFactor</name>
	644	<value>50</value>
	645	<description>The factor that determines the frequency of Lucene segment
	646	merges. This must not be less than 2, higher values increase indexing
	647	speed but lead to increased RAM usage, and increase the number of
	648	open file handles (which may lead to "Too many open files" errors).
	649	NOTE: the "segments" here have nothing to do with Nutch segments, they
	650	are a low-level data unit used by Lucene.
	651	</description>
	652	</property>
	653
	654	<property>
	655	<name>indexer.minMergeDocs</name>
	656	<value>50</value>
	657	<description>This number determines the minimum number of Lucene
	658	Documents buffered in memory between Lucene segment merges. Larger
	659	values increase indexing speed and increase RAM usage.
	660	</description>
	661	</property>
	662
	663	<property>
	664	<name>indexer.maxMergeDocs</name>
	665	<value>2147483647</value>
	666	<description>This number determines the maximum number of Lucene
	667	Documents to be merged into a new Lucene segment. Larger values
	668	increase batch indexing speed and reduce the number of Lucene segments,
	669	which reduces the number of open file handles; however, this also
	670	decreases incremental indexing performance.
	671	</description>
	672	</property>
	673
	674	<property>
	675	<name>indexer.termIndexInterval</name>
	676	<value>128</value>
	677	<description>Determines the fraction of terms which Lucene keeps in
	678	RAM when searching, to facilitate random-access. Smaller values use
	679	more memory but make searches somewhat faster. Larger values use
	680	less memory but make searches somewhat slower.
	681	</description>
	682	</property>
	683
	684	<!-- indexingfilter plugin properties -->
	685
	686	<property>
	687	<name>indexingfilter.order</name>
	688	<value></value>
	689	<description>The order by which index filters are applied.
	690	If empty, all available index filters (as dictated by properties
	691	plugin.includes and plugin.excludes below) are loaded and applied in system
	692	defined order. If not empty, only named filters are loaded and applied
	693	in given order. For example, if this property has value:
	694	org.apache.nutch.indexer.basic.BasicIndexingFilter org.apache.nutch.indexer.more.MoreIndexingFilter
	695	then BasicIndexingFilter is applied first, and MoreIndexingFilter second.
	696
	697	Filter ordering might have impact on result if one filter depends on output of
	698	another filter.
	699	</description>
	700	</property>
	701
	702
	703	<!-- analysis properties -->
	704
	705	<property>
	706	<name>analysis.common.terms.file</name>
	707	<value>common-terms.utf8</value>
	708	<description>The name of a file containing a list of common terms
	709	that should be indexed in n-grams.</description>
	710	</property>
	711
	712	<!-- searcher properties -->
	713
	714	<property>
	715	<name>searcher.dir</name>
	716	<value>crawl</value>
	717	<description>
	718	Path to root of crawl. This directory is searched (in
	719	order) for either the file search-servers.txt, containing a list of
	720	distributed search servers, or the directory "index" containing
	721	merged indexes, or the directory "segments" containing segment
	722	indexes.
	723	</description>
	724	</property>
	725
	726	<property>
	727	<name>searcher.filter.cache.size</name>
	728	<value>16</value>
	729	<description>
	730	Maximum number of filters to cache. Filters can accelerate certain
	731	field-based queries, like language, document format, etc. Each
	732	filter requires one bit of RAM per page. So, with a 10 million page
	733	index, a cache size of 16 consumes two bytes per page, or 20MB.
	734	</description>
	735	</property>
	736
	737	<property>
	738	<name>searcher.filter.cache.threshold</name>
	739	<value>0.05</value>
	740	<description>
	741	Filters are cached when their term is matched by more than this
	742	fraction of pages. For example, with a threshold of 0.05, and 10
	743	million pages, the term must match more than 1/20, or 500,000 pages.
	744	So, if out of 10 million pages, 50% of pages are in English, and 2%
	745	are in Finnish, then, with a threshold of 0.05, searches for
	746	"lang:en" will use a cached filter, while searches for "lang:fi"
	747	will score all 200,000 Finnish documents.
	748	</description>
	749	</property>
	750
	751	<property>
	752	<name>searcher.hostgrouping.rawhits.factor</name>
	753	<value>2.0</value>
	754	<description>
	755	A factor that is used to determine the number of raw hits
	756	initially fetched, before host grouping is done.
	757	</description>
	758	</property>
	759
	760	<property>
	761	<name>searcher.summary.context</name>
	762	<value>5</value>
	763	<description>
	764	The number of context terms to display preceding and following
	765	matching terms in a hit summary.
	766	</description>
	767	</property>
	768
	769	<property>
	770	<name>searcher.summary.length</name>
	771	<value>20</value>
	772	<description>
	773	The total number of terms to display in a hit summary.
	774	</description>
	775	</property>
	776
	777	<property>
	778	<name>searcher.max.hits</name>
	779	<value>-1</value>
	780	<description>If positive, search stops after this many hits are
	781	found. Setting this to small, positive values (e.g., 1000) can make
	782	searches much faster. With a sorted index, the quality of the hits
	783	suffers little.</description>
	784	</property>
	785
	786	<property>
	787	<name>searcher.max.time.tick_count</name>
	788	<value>-1</value>
	789	<description>If positive value is defined here, limit search time for
	790	every request to this number of elapsed ticks (see the tick_length
	791	property below). The total maximum time for any search request will be
	792	then limited to tick_count * tick_length milliseconds. When search time
	793	is exceeded, partial results will be returned, and the total number of
	794	hits will be estimated.
	795	</description>
	796	</property>
	797
	798	<property>
	799	<name>searcher.max.time.tick_length</name>
	800	<value>200</value>
	801	<description>The number of milliseconds between ticks. Larger values
	802	reduce the timer granularity (precision). Smaller values bring more
	803	overhead.
	804	</description>
	805	</property>
	806
	807	<property>
	808	<name>searcher.num.handlers</name>
	809	<value>10</value>
	810	<description>The number of handlers for the distributed search server.
	811	</description>
	812	</property>
	813
	814	<property>
	815	<name>searcher.max.hits.per.page</name>
	816	<value>1000</value>
	817	<description> The maximum number of hits to show per page. -1 if
	818	unlimited. If the number of hits requested by the user (via
	819	hitsPerPage parameter in the query string) is more than the value
	820	specified in this property, then this value is assumed as the number
	821	of hits per page.
	822	</description>
	823	</property>
	824
	825	<!-- URL normalizer properties -->
	826
	827	<property>
	828	<name>urlnormalizer.order</name>
	829	<value>org.apache.nutch.net.urlnormalizer.basic.BasicURLNormalizer org.apache.nutch.net.urlnormalizer.regex.RegexURLNormalizer</value>
	830	<description>Order in which normalizers will run. If any of these isn't
	831	activated it will be silently skipped. If other normalizers not on the
	832	list are activated, they will run in random order after the ones
	833	specified here are run.
	834	</description>
	835	</property>
	836
	837	<property>
	838	<name>urlnormalizer.regex.file</name>
	839	<value>regex-normalize.xml</value>
	840	<description>Name of the config file used by the RegexURLNormalizer class.
	841	</description>
	842	</property>
	843
	844	<property>
	845	<name>urlnormalizer.loop.count</name>
	846	<value>1</value>
	847	<description>Optionally loop through normalizers several times, to make
	848	sure that all transformations have been performed.
	849	</description>
	850	</property>
	851
	852	<!-- mime properties -->
	853
	854	<property>
	855	<name>mime.types.file</name>
	856	<value>tika-mimetypes.xml</value>
	857	<description>Name of file in CLASSPATH containing filename extension and
	858	magic sequence to mime types mapping information</description>
	859	</property>
	860
	861	<property>
	862	<name>mime.type.magic</name>
	863	<value>true</value>
	864	<description>Defines if the mime content type detector uses magic resolution.
	865	</description>
	866	</property>
	867
	868	<!-- plugin properties -->
	869
	870	<property>
	871	<name>plugin.folders</name>
	872	<value>plugins</value>
	873	<description>Directories where nutch plugins are located. Each
	874	element may be a relative or absolute path. If absolute, it is used
	875	as is. If relative, it is searched for on the classpath.</description>
	876	</property>
	877
	878	<property>
	879	<name>plugin.auto-activation</name>
	880	<value>true</value>
	881	<description>Defines if some plugins that are not activated regarding
	882	the plugin.includes and plugin.excludes properties must be automatically
	883	activated if they are needed by some activated plugins.
	884	</description>
	885	</property>
	886
	887	<property>
	888	<name>plugin.includes</name>
	889	<value>protocol-http\|urlfilter-regex\|parse-(text\|html\|js)\|index-(basic\|anchor)\|query-(basic\|site\|url)\|response-(json\|xml)\|summary-basic\|scoring-opic\|urlnormalizer-(pass\|regex\|basic)</value>
	890	<description>Regular expression naming plugin directory names to
	891	include. Any plugin not matching this expression is excluded.
	892	In any case you need to at least include the nutch-extensionpoints plugin. By
	893	default Nutch includes crawling just HTML and plain text via HTTP,
	894	and basic indexing and search plugins. In order to use HTTPS please enable
	895	protocol-httpclient, but be aware of possible intermittent problems with the
	896	underlying commons-httpclient library.
	897	</description>
	898	</property>
	899
	900	<property>
	901	<name>plugin.excludes</name>
	902	<value></value>
	903	<description>Regular expression naming plugin directory names to exclude.
	904	</description>
	905	</property>
	906
	907	<!-- parser properties -->
	908
	909	<property>
	910	<name>parse.plugin.file</name>
	911	<value>parse-plugins.xml</value>
	912	<description>The name of the file that defines the associations between
	913	content-types and parsers.</description>
	914	</property>
	915
	916	<property>
	917	<name>parser.character.encoding.default</name>
	918	<value>windows-1252</value>
	919	<description>The character encoding to fall back to when no other information
	920	is available</description>
	921	</property>
	922
	923	<property>
	924	<name>encodingdetector.charset.min.confidence</name>
	925	<value>-1</value>
	926	<description>An integer between 0 and 100 indicating minimum confidence value
	927	for charset auto-detection. Any negative value disables auto-detection.
	928	</description>
	929	</property>
	930
	931	<property>
	932	<name>parser.caching.forbidden.policy</name>
	933	<value>content</value>
	934	<description>If a site (or a page) requests through its robot metatags
	935	that it should not be shown as cached content, apply this policy. Currently
	936	three keywords are recognized: "none" ignores any "noarchive" directives.
	937	"content" doesn't show the content, but shows summaries (snippets).
	938	"all" doesn't show either content or summaries.</description>
	939	</property>
	940
	941
	942	<property>
	943	<name>parser.html.impl</name>
	944	<value>neko</value>
	945	<description>HTML Parser implementation. Currently the following keywords
	946	are recognized: "neko" uses NekoHTML, "tagsoup" uses TagSoup.
	947	</description>
	948	</property>
	949
	950	<property>
	951	<name>parser.html.form.use_action</name>
	952	<value>false</value>
	953	<description>If true, HTML parser will collect URLs from form action
	954	attributes. This may lead to undesirable behavior (submitting empty
	955	forms during next fetch cycle). If false, form action attribute will
	956	be ignored.</description>
	957	</property>
	958
	959	<property>
	960	<name>parser.html.outlinks.ignore_tags</name>
	961	<value></value>
	962	<description>Comma separated list of HTML tags, from which outlinks
	963	shouldn't be extracted. Nutch takes links from: a, area, form, frame,
	964	iframe, script, link, img. If you add any of those tags here, it
	965	won't be taken. Default is empty list. Probably reasonable value
	966	for most people would be "img,script,link".</description>
	967	</property>
	968
	969
	970	<!-- urlfilter plugin properties -->
	971
	972	<property>
	973	<name>urlfilter.domain.file</name>
	974	<value>domain-urlfilter.txt</value>
	975	<description>Name of file on CLASSPATH containing either top level domains or
	976	hostnames used by urlfilter-domain (DomainURLFilter) plugin.</description>
	977	</property>
	978
	979	<property>
	980	<name>urlfilter.regex.file</name>
	981	<value>regex-urlfilter.txt</value>
	982	<description>Name of file on CLASSPATH containing regular expressions
	983	used by urlfilter-regex (RegexURLFilter) plugin.</description>
	984	</property>
	985
	986	<property>
	987	<name>urlfilter.automaton.file</name>
	988	<value>automaton-urlfilter.txt</value>
	989	<description>Name of file on CLASSPATH containing regular expressions
	990	used by urlfilter-automaton (AutomatonURLFilter) plugin.</description>
	991	</property>
	992
	993	<property>
	994	<name>urlfilter.prefix.file</name>
	995	<value>prefix-urlfilter.txt</value>
	996	<description>Name of file on CLASSPATH containing url prefixes
	997	used by urlfilter-prefix (PrefixURLFilter) plugin.</description>
	998	</property>
	999
	1000	<property>
	1001	<name>urlfilter.suffix.file</name>
	1002	<value>suffix-urlfilter.txt</value>
	1003	<description>Name of file on CLASSPATH containing url suffixes
	1004	used by urlfilter-suffix (SuffixURLFilter) plugin.</description>
	1005	</property>
	1006
	1007	<property>
	1008	<name>urlfilter.order</name>
	1009	<value></value>
	1010	<description>The order by which url filters are applied.
	1011	If empty, all available url filters (as dictated by properties
	1012	plugin.includes and plugin.excludes above) are loaded and applied in system
	1013	defined order. If not empty, only named filters are loaded and applied
	1014	in given order. For example, if this property has value:
	1015	org.apache.nutch.urlfilter.regex.RegexURLFilter org.apache.nutch.urlfilter.prefix.PrefixURLFilter
	1016	then RegexURLFilter is applied first, and PrefixURLFilter second.
	1017	Since all filters are AND'ed, filter ordering does not have impact
	1018	on end result, but it may have performance implication, depending
	1019	on relative expensiveness of filters.
	1020	</description>
	1021	</property>
	1022
	1023	<!-- scoring filters properties -->
	1024
	1025	<property>
	1026	<name>scoring.filter.order</name>
	1027	<value></value>
	1028	<description>The order in which scoring filters are applied.
	1029	This may be left empty (in which case all available scoring
	1030	filters will be applied in the order defined in plugin.includes
	1031	and plugin.excludes), or a space separated list of implementation
	1032	classes.
	1033	</description>
	1034	</property>
	1035
	1036	<!-- clustering extension properties -->
	1037
	1038	<property>
	1039	<name>extension.clustering.hits-to-cluster</name>
	1040	<value>100</value>
	1041	<description>Number of snippets retrieved for the clustering extension
	1042	if clustering extension is available and user requested results
	1043	to be clustered.</description>
	1044	</property>
	1045
	1046	<property>
	1047	<name>extension.clustering.extension-name</name>
	1048	<value></value>
	1049	<description>Use the specified online clustering extension. If empty,
	1050	the first available extension will be used. The "name" here refers to an 'id'
	1051	attribute of the 'implementation' element in the plugin descriptor XML
	1052	file.</description>
	1053	</property>
	1054
	1055	<!-- ontology extension properties -->
	1056
	1057	<property>
	1058	<name>extension.ontology.extension-name</name>
	1059	<value></value>
	1060	<description>Use the specified online ontology extension. If empty,
	1061	the first available extension will be used. The "name" here refers to an 'id'
	1062	attribute of the 'implementation' element in the plugin descriptor XML
	1063	file.</description>
	1064	</property>
	1065
	1066	<property>
	1067	<name>extension.ontology.urls</name>
	1068	<value>
	1069	</value>
	1070	<description>Urls of owl files, separated by spaces, such as
	1071	http://www.example.com/ontology/time.owl
	1072	http://www.example.com/ontology/space.owl
	1073	http://www.example.com/ontology/wine.owl
	1074	Or
	1075	file:/ontology/time.owl
	1076	file:/ontology/space.owl
	1077	file:/ontology/wine.owl
	1078	You have to make sure each url is valid.
	1079	By default, there is no owl file, so query refinement based on ontology
	1080	is silently ignored.
	1081	</description>
	1082	</property>
	1083
	1084	<!-- query-basic plugin properties -->
	1085
	1086	<property>
	1087	<name>query.url.boost</name>
	1088	<value>4.0</value>
	1089	<description> Used as a boost for url field in Lucene query.
	1090	</description>
	1091	</property>
	1092
	1093	<property>
	1094	<name>query.anchor.boost</name>
	1095	<value>2.0</value>
	1096	<description> Used as a boost for anchor field in Lucene query.
	1097	</description>
	1098	</property>
	1099
	1100	<property>
	1101	<name>query.title.boost</name>
	1102	<value>1.5</value>
	1103	<description> Used as a boost for title field in Lucene query.
	1104	</description>
	1105	</property>
	1106
	1107	<property>
	1108	<name>query.host.boost</name>
	1109	<value>2.0</value>
	1110	<description> Used as a boost for host field in Lucene query.
	1111	</description>
	1112	</property>
	1113
	1114	<property>
	1115	<name>query.phrase.boost</name>
	1116	<value>1.0</value>
	1117	<description> Used as a boost for phrase in Lucene query.
	1118	Multiplied by the boost for the field the phrase is matched in.
	1119	</description>
	1120	</property>
	1121
	1122	<!--
	1123	<property>
	1124	<name>query.basic.description.boost</name>
	1125	<value>1.0</value>
	1126	<description> Declares a custom field and its boost to be added to the default fields of the Lucene query.
	1127	</description>
	1128	</property>
	1129	-->
	1130
	1131	<!-- creative-commons plugin properties -->
	1132
	1133	<property>
	1134	<name>query.cc.boost</name>
	1135	<value>0.0</value>
	1136	<description> Used as a boost for cc field in Lucene query.
	1137	</description>
	1138	</property>
	1139
	1140	<!-- query-more plugin properties -->
	1141
	1142	<property>
	1143	<name>query.type.boost</name>
	1144	<value>0.0</value>
	1145	<description> Used as a boost for type field in Lucene query.
	1146	</description>
	1147	</property>
	1148
	1149	<!-- query-site plugin properties -->
	1150
	1151	<property>
	1152	<name>query.site.boost</name>
	1153	<value>0.0</value>
	1154	<description> Used as a boost for site field in Lucene query.
	1155	</description>
	1156	</property>
	1157
	1158	<!-- microformats-reltag plugin properties -->
	1159
	1160	<property>
	1161	<name>query.tag.boost</name>
	1162	<value>1.0</value>
	1163	<description> Used as a boost for tag field in Lucene query.
	1164	</description>
	1165	</property>
	1166
	1167	<!-- language-identifier plugin properties -->
	1168
	1169	<property>
	1170	<name>lang.ngram.min.length</name>
	1171	<value>1</value>
	1172	<description> The minimum size of ngrams to use to identify
	1173	language (must be between 1 and lang.ngram.max.length).
	1174	The larger the range between lang.ngram.min.length and
	1175	lang.ngram.max.length, the better the identification, but
	1176	the slower it is.
	1177	</description>
	1178	</property>
	1179
	1180	<property>
	1181	<name>lang.ngram.max.length</name>
	1182	<value>4</value>
	1183	<description> The maximum size of ngrams to use to identify
	1184	language (must be between lang.ngram.min.length and 4).
	1185	The larger the range between lang.ngram.min.length and
	1186	lang.ngram.max.length, the better the identification, but
	1187	the slower it is.
	1188	</description>
	1189	</property>
	1190
	1191	<property>
	1192	<name>lang.analyze.max.length</name>
	1193	<value>2048</value>
	1194	<description> The maximum bytes of data to use to identify
	1195	the language (0 means full content analysis).
	1196	The larger this value, the better the analysis, but the
	1197	slower it is.
	1198	</description>
	1199	</property>
	1200
	1201	<property>
	1202	<name>query.lang.boost</name>
	1203	<value>0.0</value>
	1204	<description> Used as a boost for lang field in Lucene query.
	1205	</description>
	1206	</property>
	1207
	1208	<!-- Temporary Hadoop 0.17.x workaround. -->
	1209
	1210	<property>
	1211	<name>hadoop.job.history.user.location</name>
	1212	<value>${hadoop.log.dir}/history/user</value>
	1213	<description>Hadoop 0.17.x comes with a default setting to create
	1214	user logs inside the output path of the job. This breaks some
	1215	Hadoop classes, which expect the output to contain only
	1216	part-XXXXX files. This setting changes the output to a
	1217	subdirectory of the regular log directory.
	1218	</description>
	1219	</property>
	1220
	1221	<!-- response writer properties -->
	1222
	1223	<property>
	1224	<name>search.response.default.type</name>
	1225	<value>xml</value>
	1226	<description>
	1227	The default response type returned if none is specified.
	1228	</description>
	1229	</property>
	1230
	1231	<property>
	1232	<name>search.response.default.lang</name>
	1233	<value>en</value>
	1234	<description>
	1235	The default response language if none is specified.
	1236	</description>
	1237	</property>
	1238
	1239	<property>
	1240	<name>search.response.default.numrows</name>
	1241	<value>10</value>
	1242	<description>
	1243	The default number of rows to return if none is specified.
	1244	</description>
	1245	</property>
	1246
	1247	<property>
	1248	<name>search.response.default.dedupfield</name>
	1249	<value>site</value>
	1250	<description>
	1251	The default dedup field if none is specified.
	1252	</description>
	1253	</property>
	1254
	1255	<property>
	1256	<name>search.response.default.numdupes</name>
	1257	<value>1</value>
	1258	<description>
	1259	The default number of duplicates returned if none is specified.
	1260	</description>
	1261	</property>
	1262
	1263	<property>
	1264	<name>searcher.response.maxage</name>
	1265	<value>86400</value>
	1266	<description>
	1267	The maxage of a response in seconds. Used in caching headers.
	1268	</description>
	1269	</property>
	1270
	1271	<property>
	1272	<name>searcher.response.prettyprint</name>
	1273	<value>true</value>
	1274	<description>
	1275	Should the response output be pretty printed. Setting to true enables better
	1276	debugging, false removes unneeded spaces and gives better throughput.
	1277	</description>
	1278	</property>
	1279
	1280	</configuration>

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: nutchez-0.1/conf/nutch-default.xml @ 67

Download in other formats: